clinical-data-validators 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clinical_data_validators-0.4.0.dist-info/METADATA +49 -0
- clinical_data_validators-0.4.0.dist-info/RECORD +15 -0
- clinical_data_validators-0.4.0.dist-info/WHEEL +5 -0
- clinical_data_validators-0.4.0.dist-info/top_level.txt +1 -0
- clinical_validators/__init__.py +24 -0
- clinical_validators/age_consistency.py +56 -0
- clinical_validators/data_completeness.py +57 -0
- clinical_validators/duplicate_records.py +62 -0
- clinical_validators/future_dates.py +37 -0
- clinical_validators/gender_based_validation.py +61 -0
- clinical_validators/invalid_data_types.py +46 -0
- clinical_validators/invalid_patient_ids.py +74 -0
- clinical_validators/missing_fields.py +113 -0
- clinical_validators/missing_visit_data.py +43 -0
- clinical_validators/out_of_range_values.py +61 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clinical-data-validators
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: A library for validating clinical data quality
|
|
5
|
+
Author: Navin Kumar
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Dynamic: author
|
|
13
|
+
Dynamic: classifier
|
|
14
|
+
Dynamic: description
|
|
15
|
+
Dynamic: description-content-type
|
|
16
|
+
Dynamic: requires-dist
|
|
17
|
+
Dynamic: requires-python
|
|
18
|
+
Dynamic: summary
|
|
19
|
+
|
|
20
|
+
# Clinical Data Validators
|
|
21
|
+
|
|
22
|
+
A Python library for validating clinical datasets.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
pip install clinical-data-validators
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```python
import pandas as pd
from clinical_validators import validate_missing_critical_fields

df = pd.read_csv("data.csv")
result = validate_missing_critical_fields(df)
print(result)
```
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## Validators
|
|
40
|
+
1. Missing Critical Fields
|
|
41
|
+
2. Invalid Data Types
|
|
42
|
+
3. Future Dates
|
|
43
|
+
4. Out of Range Values
|
|
44
|
+
5. Duplicate Records
|
|
45
|
+
6. Invalid Patient IDs
|
|
46
|
+
7. Missing Visit Data
|
|
47
|
+
8. Age Consistency
|
|
48
|
+
9. Gender-Based Validation
|
|
49
|
+
10. Data Completeness
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
clinical_validators/__init__.py,sha256=y3ziB5Ikc-lL3M80UnyL5iKzQTSSKEMDgpHTR7XaltY,958
|
|
2
|
+
clinical_validators/age_consistency.py,sha256=Uw75D5_i8RKwv0Z4NNRJs-2CiTRN4It3MLkUm2TujCI,1958
|
|
3
|
+
clinical_validators/data_completeness.py,sha256=IhcChtVydd2Y6T5ReDPGsiZYyjAUt5ArhgNnEmRJy-A,2142
|
|
4
|
+
clinical_validators/duplicate_records.py,sha256=IReFwioNGpHlNrGx3q30BQohEvTt96Y30MpR_NJjMOo,2449
|
|
5
|
+
clinical_validators/future_dates.py,sha256=Xy0JbqdU-vQdUFmY8jZLjasApt1LASLdmCue-mf5F-0,1508
|
|
6
|
+
clinical_validators/gender_based_validation.py,sha256=VWi28vGAyg8MVNjEKB9XfnK0rR-Ol0gylN0u6uxI05c,2113
|
|
7
|
+
clinical_validators/invalid_data_types.py,sha256=zsFifgoLUZ6REobPj72sAKgt9CthDAoeA8XjVBD6wEA,1973
|
|
8
|
+
clinical_validators/invalid_patient_ids.py,sha256=Sh-BrLXeLEUQJ54I9xTA6xg97__nxZpPuBv0w6SJNzA,2773
|
|
9
|
+
clinical_validators/missing_fields.py,sha256=08vI5SJVLsfg_y1wpiNxye9hCxFRVShv8Q_5UVdTFog,4269
|
|
10
|
+
clinical_validators/missing_visit_data.py,sha256=i940dRgy58soz8W2xlds7VQCoi0aNFCPZeSUM1HC42E,1565
|
|
11
|
+
clinical_validators/out_of_range_values.py,sha256=Rtag7MJrdyMLajNyBtDQ16bHWhtp6DHopoxBOozkxgA,2226
|
|
12
|
+
clinical_data_validators-0.4.0.dist-info/METADATA,sha256=oHNClMVzg6dhOCweyszLGi_93PY0fsvym5utiR6X6HI,1156
|
|
13
|
+
clinical_data_validators-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
clinical_data_validators-0.4.0.dist-info/top_level.txt,sha256=-OtSRpWXkxFLxhj6uy-Vn3HWbgHzWithqh4CWmkGWnQ,20
|
|
15
|
+
clinical_data_validators-0.4.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
clinical_validators
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .missing_fields import validate_missing_critical_fields
|
|
2
|
+
from .invalid_data_types import validate_invalid_data_types
|
|
3
|
+
from .future_dates import validate_future_dates
|
|
4
|
+
from .out_of_range_values import validate_out_of_range_values
|
|
5
|
+
from .duplicate_records import validate_duplicate_records
|
|
6
|
+
from .invalid_patient_ids import validate_patient_ids
|
|
7
|
+
from .missing_visit_data import validate_missing_visit_data
|
|
8
|
+
from .age_consistency import validate_age_consistency
|
|
9
|
+
from .gender_based_validation import validate_gender_based_tests
|
|
10
|
+
from .data_completeness import validate_data_completeness
|
|
11
|
+
|
|
12
|
+
# Version string of the installed package.
__version__ = "0.4.0"

# Names exported by ``from clinical_validators import *``; one entry per
# validator imported above.
__all__ = [
    "validate_missing_critical_fields",
    "validate_invalid_data_types",
    "validate_future_dates",
    "validate_out_of_range_values",
    "validate_duplicate_records",
    "validate_patient_ids",
    "validate_missing_visit_data",
    "validate_age_consistency",
    "validate_gender_based_tests",
    "validate_data_completeness",
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
def validate_age_consistency(dataframe, age_field='age', birth_date_field='birth_date', tolerance_years=1):
    """
    Validates that the recorded age is consistent with the birth date.

    The expected age is the number of days between the birth date and the
    current local time divided by 365.25. A row fails when that value differs
    from the recorded age by more than ``tolerance_years``.

    Rows are skipped (not reported) when the age or birth date is missing,
    when the birth date cannot be parsed, or when the age is not numeric.
    Timezone-aware birth dates have their offset dropped before the
    comparison so that they are validated rather than silently skipped.

    Args:
        dataframe: pandas DataFrame.
        age_field: Column name for age.
        birth_date_field: Column name for birth date.
        tolerance_years: Allowed difference in years (default 1 for leap year/rounding).

    Returns:
        dict with validation results (validator_name, status, total_records,
        failed_records, failure_count, failures).

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    failures = []
    failed_row_indices = set()
    now = datetime.now()

    for idx, row in dataframe.iterrows():
        age = row.get(age_field)
        birth_date = row.get(birth_date_field)

        if pd.isna(age) or pd.isna(birth_date):
            continue

        # Only the conversions are guarded: unparseable values are left to
        # the data-type validator, everything else should surface normally.
        try:
            birth_ts = pd.to_datetime(birth_date)
            if birth_ts.tzinfo is not None:
                # Subtracting an aware datetime from the naive ``now`` raises
                # TypeError, which previously hid these rows from validation.
                birth_ts = birth_ts.tz_localize(None)
            calculated_age = (now - birth_ts.to_pydatetime()).days / 365.25
            recorded_age = float(age)
        except (ValueError, TypeError):
            continue

        if abs(calculated_age - recorded_age) > tolerance_years:
            failed_row_indices.add(idx)
            failures.append({
                'row_index': idx,
                'patient_id': row.get('patient_id', 'UNKNOWN'),
                'recorded_age': age,
                'calculated_age': round(calculated_age, 1),
                'error': 'Age does not match birth date'
            })

    return {
        'validator_name': 'age_consistency',
        'status': 'PASS' if not failed_row_indices else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_data_completeness(dataframe, test_field='test_type', completeness_rules=None):
    """
    Checks that every row carries the fields its test type requires.

    The test type of each row is read from ``test_field`` (surrounding
    whitespace stripped) and looked up, case-sensitively, in the rules. Each
    required field that is not a column, is NA, or is a blank string yields
    one failure entry. Rows without a test type, or with a test type that has
    no rule, are skipped.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name indicating the test type.
        completeness_rules: Dict mapping test types to lists of required fields.
                            Default: {'Blood': ['hemoglobin', 'wbc'], 'Urine': ['ph', 'protein']}

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    rules = completeness_rules
    if rules is None:
        rules = {
            'Blood': ['hemoglobin', 'wbc'],
            'Urine': ['ph', 'protein']
        }

    def _is_absent(record, column):
        # A field counts as absent when the column does not exist at all,
        # or the cell is NA, or the cell renders as a blank string.
        if column not in dataframe.columns:
            return True
        if pd.isna(record.get(column)):
            return True
        return str(record.get(column)).strip() == ''

    problems = []
    bad_rows = set()

    for row_label, record in dataframe.iterrows():
        raw_type = record.get(test_field)
        if pd.isna(raw_type):
            continue

        type_name = str(raw_type).strip()
        if type_name not in rules:
            continue

        for column in rules[type_name]:
            if not _is_absent(record, column):
                continue
            bad_rows.add(row_label)
            problems.append({
                'row_index': row_label,
                'patient_id': record.get('patient_id', 'UNKNOWN'),
                'test_type': type_name,
                'missing_field': column,
                'error': f"Missing required field '{column}' for {type_name} test"
            })

    return {
        'validator_name': 'data_completeness',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_duplicate_records(dataframe, duplicate_fields=None):
    """
    Validates that there are no duplicate records based on specified fields.
    Typical use case: same patient + same test + same date

    Two rows are duplicates when they agree on every checked field. Missing
    values (NaN/None) in a checked field are treated as equal to each other,
    matching ``DataFrame.duplicated``, so duplicated rows with a missing key
    are reported as well. Every row of a duplicate group yields one failure.

    Args:
        dataframe: pandas DataFrame with clinical data
        duplicate_fields: list of field names to check for duplicates
                         Default: ['patient_id', 'test_name', 'test_date']
                         Names that are not columns of the DataFrame are
                         ignored.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty, or fewer than two of the requested
            fields exist in the DataFrame.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if duplicate_fields is None:
        duplicate_fields = ['patient_id', 'test_name', 'test_date']

    # Only fields that actually exist can take part in the comparison.
    available_fields = [f for f in duplicate_fields if f in dataframe.columns]
    if len(available_fields) < 2:
        raise ValueError(f"Need at least 2 fields to check duplicates. Found: {available_fields}")

    # keep=False marks every member of a duplicate group, not just repeats.
    duplicates = dataframe[dataframe.duplicated(subset=available_fields, keep=False)]

    failures = []
    if not duplicates.empty:
        # dropna=False keeps groups whose key contains a missing value; the
        # default would drop exactly the rows duplicated() just flagged.
        for key, group in duplicates.groupby(available_fields, dropna=False):
            key_values = key if isinstance(key, tuple) else (key,)
            occurrences = len(group)
            if occurrences < 2:
                continue
            for idx in group.index.tolist():
                failures.append({
                    'row_index': idx,
                    'duplicate_fields': available_fields,
                    'duplicate_values': dict(zip(available_fields, key_values)),
                    'occurrences': occurrences
                })

    failed_row_indices = {f['row_index'] for f in failures}

    return {
        'validator_name': 'duplicate_records',
        'status': 'PASS' if not failed_row_indices else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_future_dates(dataframe, date_fields=None):
    """
    Validates that date fields do not contain dates in the future.

    Timezone-naive values are compared with the current local time.
    Timezone-aware values are compared with the current time in UTC, so the
    instant they denote is what is judged, not their wall-clock reading.
    Missing, blank and unparseable values are skipped.

    Args:
        dataframe: pandas DataFrame with clinical data.
        date_fields: list of date column names to check.
                     Default: ['visit_date', 'test_date']
                     Names that are not columns of the DataFrame are ignored.

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if date_fields is None:
        date_fields = ['visit_date', 'test_date']

    failures = []
    failed_row_indices = set()
    now_naive = pd.Timestamp.now()
    now_utc = pd.Timestamp.now(tz='UTC')
    has_patient_id = 'patient_id' in dataframe.columns

    for field in date_fields:
        if field not in dataframe.columns:
            continue
        for idx, val in dataframe[field].items():
            if pd.isna(val) or str(val).strip() == '':
                continue
            try:
                date_val = pd.to_datetime(val)
                # Naive and aware timestamps cannot be compared with each
                # other, so pick the reference that matches the value.
                reference = now_naive if date_val.tzinfo is None else now_utc
                is_future = bool(date_val > reference)
            except (ValueError, TypeError):
                # Unparseable values are not this validator's concern.
                continue

            if not is_future:
                continue

            failed_row_indices.add(idx)
            patient_id = dataframe.at[idx, 'patient_id'] if has_patient_id else 'UNKNOWN'
            failures.append({
                'row_index': idx,
                'field': field,
                'invalid_value': str(val),
                'error': 'Date is in the future',
                'patient_id': patient_id
            })

    return {
        'validator_name': 'future_dates',
        'status': 'PASS' if not failed_row_indices else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_gender_based_tests(dataframe, test_field='test_name', gender_field='gender', rules=None):
    """
    Flags rows where a gender-restricted test is recorded for another gender.

    The test name is matched against the rule keys after stripping
    whitespace (case-sensitive); the gender is compared case-insensitively
    with the genders allowed for that test. Rows with a missing test name or
    gender, and tests without a rule, are skipped.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name for test name.
        gender_field: Column name for gender.
        rules: Dict mapping test names to allowed genders.
               Default: {'Prostate_Specific_Antigen': ['M', 'Male'], 'Pap_Smear': ['F', 'Female']}

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    active_rules = rules
    if active_rules is None:
        active_rules = {
            'Prostate_Specific_Antigen': ['M', 'Male'],
            'Pap_Smear': ['F', 'Female']
        }

    problems = []
    bad_rows = set()

    for row_label, record in dataframe.iterrows():
        raw_test = record.get(test_field)
        raw_gender = record.get(gender_field)
        if pd.isna(raw_test) or pd.isna(raw_gender):
            continue

        test_label = str(raw_test).strip()
        gender_label = str(raw_gender).strip()

        # Tests without a rule are unrestricted.
        if test_label not in active_rules:
            continue

        permitted = {option.lower() for option in active_rules[test_label]}
        if gender_label.lower() in permitted:
            continue

        bad_rows.add(row_label)
        problems.append({
            'row_index': row_label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'test': test_label,
            'gender': gender_label,
            'error': f"Test {test_label} is invalid for gender {gender_label}"
        })

    return {
        'validator_name': 'gender_based_validation',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_invalid_data_types(dataframe, numeric_fields=None, date_fields=None):
    """
    Flags values that cannot be converted to their expected type.

    Numeric fields are checked with ``float()`` and date fields with
    ``pd.to_datetime()``; a value fails when the conversion raises
    ValueError or TypeError. Missing and blank values are skipped, as are
    field names that are not columns of the DataFrame. Numeric fields are
    checked before date fields.

    Args:
        dataframe: pandas DataFrame with clinical data.
        numeric_fields: list of columns expected to be numeric.
                        Default: ['lab_value']
        date_fields: list of columns expected to hold dates.
                     Default: ['visit_date', 'test_date']

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    numeric_columns = ['lab_value'] if numeric_fields is None else numeric_fields
    date_columns = ['visit_date', 'test_date'] if date_fields is None else date_fields

    # (columns to check, converter that must succeed, label reported on failure)
    checks = (
        (numeric_columns, float, 'numeric'),
        (date_columns, pd.to_datetime, 'datetime'),
    )

    problems = []
    bad_rows = set()
    has_patient_id = 'patient_id' in dataframe.columns

    for columns, convert, type_label in checks:
        for column in columns:
            if column not in dataframe.columns:
                continue
            for row_label, raw in dataframe[column].items():
                if pd.isna(raw) or str(raw).strip() == '':
                    continue
                try:
                    convert(raw)
                except (ValueError, TypeError):
                    bad_rows.add(row_label)
                    owner = dataframe.at[row_label, 'patient_id'] if has_patient_id else 'UNKNOWN'
                    problems.append({
                        'row_index': row_label,
                        'field': column,
                        'invalid_value': str(raw),
                        'expected_type': type_label,
                        'patient_id': owner
                    })

    return {
        'validator_name': 'invalid_data_types',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
def validate_patient_ids(dataframe, patient_id_field='patient_id', pattern=None, min_length=None, max_length=None):
    """
    Validates patient ID format according to specified rules.

    Each non-missing ID is converted to a string with surrounding whitespace
    stripped, then checked against the pattern (matched from the start of the
    string) and, if that passes, against the length bounds. Whole-number
    floats are rendered without the trailing ``.0`` first: a numeric ID
    column that contains a missing value is stored as float, and ``12345.0``
    would otherwise never match a digits-only rule.

    Args:
        dataframe: pandas DataFrame with clinical data
        patient_id_field: name of the patient ID column (default: 'patient_id')
        pattern: regex pattern for validation (default: None)
        min_length: minimum length of patient ID (default: None)
        max_length: maximum length of patient ID (default: None)

    Default behavior: Validates that patient IDs are exactly 5 digits

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty or the ID column does not exist.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if patient_id_field not in dataframe.columns:
        raise ValueError(f"Column '{patient_id_field}' not found in DataFrame")

    # Default: 5-digit numeric ID, used only when no rule at all was given.
    if pattern is None and min_length is None and max_length is None:
        pattern = r'^\d{5}$'  # Exactly 5 digits

    # Compile once instead of on every row.
    matcher = re.compile(pattern) if pattern else None

    failures = []
    failed_row_indices = set()

    for idx, val in dataframe[patient_id_field].items():
        if pd.isna(val) or str(val).strip() == '':
            continue  # Skip missing values

        if isinstance(val, float) and val.is_integer():
            # 12345.0 -> 12345, so float-typed ID columns validate correctly.
            val = int(val)

        patient_id_str = str(val).strip()
        error_reason = None

        if matcher is not None and not matcher.match(patient_id_str):
            error_reason = f"Does not match pattern: {pattern}"
        elif min_length is not None and len(patient_id_str) < min_length:
            error_reason = f"Length {len(patient_id_str)} < minimum {min_length}"
        elif max_length is not None and len(patient_id_str) > max_length:
            error_reason = f"Length {len(patient_id_str)} > maximum {max_length}"

        if error_reason is not None:
            failed_row_indices.add(idx)
            failures.append({
                'row_index': idx,
                'field': patient_id_field,
                'invalid_value': patient_id_str,
                'error': error_reason
            })

    return {
        'validator_name': 'invalid_patient_ids',
        'status': 'PASS' if not failed_row_indices else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
def validate_missing_critical_fields(
    dataframe: pd.DataFrame,
    critical_fields: Optional[List[str]] = None
) -> Dict:
    """
    Validates that critical clinical fields are not NULL/empty.

    This validator checks for missing values (NaN, None, NaT, and strings
    that are empty or contain only whitespace) in critical clinical data
    fields. Critical fields that are not columns of the DataFrame are
    skipped.

    Args:
        dataframe: pandas DataFrame with clinical data
        critical_fields: list of required field names. If None, uses default
                        clinical fields (patient_id, visit_date, lab_test_name,
                        lab_value, test_date)

    Returns:
        Dictionary with validation results containing:
        - validator_name: name of the validator
        - status: 'PASS' or 'FAIL'
        - total_records: total rows in dataframe
        - failed_records: number of rows with missing values
        - critical_fields_checked: the list of critical fields requested
        - failure_count: total number of missing value instances
        - failures: list of detailed failure information

    Raises:
        ValueError: if dataframe is empty or critical_fields is empty list
        TypeError: if dataframe is not a pandas DataFrame

    Example:
        >>> import pandas as pd
        >>> from clinical_validators import validate_missing_critical_fields
        >>> df = pd.read_csv('lab_data.csv')
        >>> result = validate_missing_critical_fields(df)
        >>> print(result['status'])
        'FAIL'
        >>> print(f"Failed records: {result['failed_records']}")
        'Failed records: 5'
    """

    # ===== INPUT VALIDATION =====
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(dataframe).__name__}")

    if dataframe.empty:
        raise ValueError("DataFrame is empty - cannot validate empty dataset")

    # Default critical fields for clinical data
    if critical_fields is None:
        critical_fields = [
            'patient_id',
            'visit_date',
            'lab_test_name',
            'lab_value',
            'test_date'
        ]

    if isinstance(critical_fields, list) and len(critical_fields) == 0:
        raise ValueError("critical_fields list cannot be empty")

    def _missing_flags(column):
        """Return one bool per row: True where the cell is NA or a blank string."""
        return [
            is_na or (isinstance(value, str) and value.strip() == '')
            for value, is_na in zip(column.tolist(), column.isna().tolist())
        ]

    # ===== VALIDATION LOGIC =====
    # Rows are paired with their label and patient id by position, so the
    # lookup stays correct even when index labels repeat.
    row_labels = dataframe.index.tolist()
    if 'patient_id' in dataframe.columns:
        pid_column = dataframe['patient_id']
        patient_ids = [
            'UNKNOWN' if is_missing else str(value)
            for value, is_missing in zip(pid_column.tolist(), _missing_flags(pid_column))
        ]
    else:
        patient_ids = ['UNKNOWN'] * len(row_labels)

    failures = []
    for field in critical_fields:
        # Skip if field doesn't exist in dataframe
        if field not in dataframe.columns:
            continue

        for position, is_missing in enumerate(_missing_flags(dataframe[field])):
            if is_missing:
                failures.append({
                    'row_index': row_labels[position],
                    'missing_field': field,
                    'patient_id': patient_ids[position]
                })

    # Count unique rows with any failures
    failed_row_indices = {f['row_index'] for f in failures}

    # ===== RETURN RESULTS =====
    return {
        'validator_name': 'missing_critical_fields',
        'status': 'PASS' if not failed_row_indices else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'critical_fields_checked': critical_fields,
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_missing_visit_data(dataframe, lab_field='lab_value', visit_field='visit_date'):
    """
    Flags rows that hold a lab result but no visit date.

    A cell counts as filled when it is not NA and does not render as a blank
    string. A row fails when its lab cell is filled and its visit cell is not.

    Args:
        dataframe: pandas DataFrame with clinical data.
        lab_field: Column name containing lab results.
        visit_field: Column name containing visit dates.

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    def _is_filled(record, column):
        # row.get() yields None for an absent column, which counts as empty.
        return pd.notna(record.get(column)) and str(record.get(column)).strip() != ''

    problems = []
    bad_rows = set()

    for row_label, record in dataframe.iterrows():
        if not _is_filled(record, lab_field):
            continue
        if _is_filled(record, visit_field):
            continue

        bad_rows.add(row_label)
        problems.append({
            'row_index': row_label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'error': f"Has lab value '{record[lab_field]}' but missing {visit_field}"
        })

    return {
        'validator_name': 'missing_visit_data',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_out_of_range_values(dataframe, range_rules=None):
    """
    Flags numeric values that lie outside their allowed range.

    Each value is converted with ``float()`` and fails when it is below the
    minimum or above the maximum of its rule (both bounds are inclusive).
    NA values, values that cannot be converted, and rule fields that are not
    columns of the DataFrame are skipped.

    Args:
        dataframe: pandas DataFrame with clinical data
        range_rules: dict with field names as keys and (min, max) tuples as values
                    Example: {'lab_value': (0, 1000), 'age': (0, 120)}
                    Default: {'lab_value': (0, 10000), 'age': (0, 120)}

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    rules = range_rules
    if rules is None:
        # Default clinical ranges
        rules = {
            'lab_value': (0, 10000),  # Generic lab value range
            'age': (0, 120)           # Human age range
        }

    problems = []
    bad_rows = set()
    has_patient_id = 'patient_id' in dataframe.columns

    for column, bounds in rules.items():
        lower, upper = bounds
        if column not in dataframe.columns:
            continue

        for row_label, raw in dataframe[column].items():
            if pd.isna(raw):
                continue  # Skip missing values

            try:
                number = float(raw)
                outside = bool(number < lower or number > upper)
            except (ValueError, TypeError):
                # Skip non-numeric values (handled by invalid_data_types validator)
                continue

            if not outside:
                continue

            bad_rows.add(row_label)
            owner = dataframe.at[row_label, 'patient_id'] if has_patient_id else 'UNKNOWN'
            problems.append({
                'row_index': row_label,
                'field': column,
                'invalid_value': number,
                'expected_range': f"{lower}-{upper}",
                'patient_id': owner
            })

    return {
        'validator_name': 'out_of_range_values',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|