clinical-data-validators 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clinical_data_validators-0.4.0/PKG-INFO +49 -0
- clinical_data_validators-0.4.0/README.md +30 -0
- clinical_data_validators-0.4.0/clinical_data_validators.egg-info/PKG-INFO +49 -0
- clinical_data_validators-0.4.0/clinical_data_validators.egg-info/SOURCES.txt +28 -0
- clinical_data_validators-0.4.0/clinical_data_validators.egg-info/dependency_links.txt +1 -0
- clinical_data_validators-0.4.0/clinical_data_validators.egg-info/requires.txt +1 -0
- clinical_data_validators-0.4.0/clinical_data_validators.egg-info/top_level.txt +1 -0
- clinical_data_validators-0.4.0/clinical_validators/__init__.py +24 -0
- clinical_data_validators-0.4.0/clinical_validators/age_consistency.py +56 -0
- clinical_data_validators-0.4.0/clinical_validators/data_completeness.py +57 -0
- clinical_data_validators-0.4.0/clinical_validators/duplicate_records.py +62 -0
- clinical_data_validators-0.4.0/clinical_validators/future_dates.py +37 -0
- clinical_data_validators-0.4.0/clinical_validators/gender_based_validation.py +61 -0
- clinical_data_validators-0.4.0/clinical_validators/invalid_data_types.py +46 -0
- clinical_data_validators-0.4.0/clinical_validators/invalid_patient_ids.py +74 -0
- clinical_data_validators-0.4.0/clinical_validators/missing_fields.py +113 -0
- clinical_data_validators-0.4.0/clinical_validators/missing_visit_data.py +43 -0
- clinical_data_validators-0.4.0/clinical_validators/out_of_range_values.py +61 -0
- clinical_data_validators-0.4.0/setup.cfg +4 -0
- clinical_data_validators-0.4.0/setup.py +23 -0
- clinical_data_validators-0.4.0/tests/test_age_consistency.py +20 -0
- clinical_data_validators-0.4.0/tests/test_data_completeness.py +27 -0
- clinical_data_validators-0.4.0/tests/test_duplicate_records.py +37 -0
- clinical_data_validators-0.4.0/tests/test_future_dates.py +23 -0
- clinical_data_validators-0.4.0/tests/test_gender_based_validation.py +17 -0
- clinical_data_validators-0.4.0/tests/test_invalid_data_types.py +27 -0
- clinical_data_validators-0.4.0/tests/test_invalid_patient_ids.py +32 -0
- clinical_data_validators-0.4.0/tests/test_missing_fields.py +182 -0
- clinical_data_validators-0.4.0/tests/test_missing_visit_data.py +17 -0
- clinical_data_validators-0.4.0/tests/test_out_of_range_values.py +40 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clinical-data-validators
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: A library for validating clinical data quality
|
|
5
|
+
Author: Navin Kumar
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Dynamic: author
|
|
13
|
+
Dynamic: classifier
|
|
14
|
+
Dynamic: description
|
|
15
|
+
Dynamic: description-content-type
|
|
16
|
+
Dynamic: requires-dist
|
|
17
|
+
Dynamic: requires-python
|
|
18
|
+
Dynamic: summary
|
|
19
|
+
|
|
20
|
+
# Clinical Data Validators
|
|
21
|
+
|
|
22
|
+
A Python library for validating clinical datasets.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
pip install clinical-data-validators
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
python
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from clinical_validators import validate_missing_critical_fields
|
|
33
|
+
|
|
34
|
+
df = pd.read_csv("data.csv")
|
|
35
|
+
result = validate_missing_critical_fields(df)
|
|
36
|
+
print(result)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## Validators
|
|
40
|
+
1. Missing Critical Fields
|
|
41
|
+
2. Invalid Data Types
|
|
42
|
+
3. Future Dates
|
|
43
|
+
4. Out of Range Values
|
|
44
|
+
5. Duplicate Records
|
|
45
|
+
6. Invalid Patient IDs
|
|
46
|
+
7. Missing Visit Data
|
|
47
|
+
8. Age Consistency
|
|
48
|
+
9. Gender-Based Validation
|
|
49
|
+
10. Data Completeness
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Clinical Data Validators
|
|
2
|
+
|
|
3
|
+
A Python library for validating clinical datasets.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
pip install clinical-data-validators
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from clinical_validators import validate_missing_critical_fields
|
|
14
|
+
|
|
15
|
+
df = pd.read_csv("data.csv")
|
|
16
|
+
result = validate_missing_critical_fields(df)
|
|
17
|
+
print(result)
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Validators
|
|
21
|
+
1. Missing Critical Fields
|
|
22
|
+
2. Invalid Data Types
|
|
23
|
+
3. Future Dates
|
|
24
|
+
4. Out of Range Values
|
|
25
|
+
5. Duplicate Records
|
|
26
|
+
6. Invalid Patient IDs
|
|
27
|
+
7. Missing Visit Data
|
|
28
|
+
8. Age Consistency
|
|
29
|
+
9. Gender-Based Validation
|
|
30
|
+
10. Data Completeness
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clinical-data-validators
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: A library for validating clinical data quality
|
|
5
|
+
Author: Navin Kumar
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Dynamic: author
|
|
13
|
+
Dynamic: classifier
|
|
14
|
+
Dynamic: description
|
|
15
|
+
Dynamic: description-content-type
|
|
16
|
+
Dynamic: requires-dist
|
|
17
|
+
Dynamic: requires-python
|
|
18
|
+
Dynamic: summary
|
|
19
|
+
|
|
20
|
+
# Clinical Data Validators
|
|
21
|
+
|
|
22
|
+
A Python library for validating clinical datasets.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
pip install clinical-data-validators
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
python
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from clinical_validators import validate_missing_critical_fields
|
|
33
|
+
|
|
34
|
+
df = pd.read_csv("data.csv")
|
|
35
|
+
result = validate_missing_critical_fields(df)
|
|
36
|
+
print(result)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## Validators
|
|
40
|
+
1. Missing Critical Fields
|
|
41
|
+
2. Invalid Data Types
|
|
42
|
+
3. Future Dates
|
|
43
|
+
4. Out of Range Values
|
|
44
|
+
5. Duplicate Records
|
|
45
|
+
6. Invalid Patient IDs
|
|
46
|
+
7. Missing Visit Data
|
|
47
|
+
8. Age Consistency
|
|
48
|
+
9. Gender-Based Validation
|
|
49
|
+
10. Data Completeness
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
clinical_data_validators.egg-info/PKG-INFO
|
|
4
|
+
clinical_data_validators.egg-info/SOURCES.txt
|
|
5
|
+
clinical_data_validators.egg-info/dependency_links.txt
|
|
6
|
+
clinical_data_validators.egg-info/requires.txt
|
|
7
|
+
clinical_data_validators.egg-info/top_level.txt
|
|
8
|
+
clinical_validators/__init__.py
|
|
9
|
+
clinical_validators/age_consistency.py
|
|
10
|
+
clinical_validators/data_completeness.py
|
|
11
|
+
clinical_validators/duplicate_records.py
|
|
12
|
+
clinical_validators/future_dates.py
|
|
13
|
+
clinical_validators/gender_based_validation.py
|
|
14
|
+
clinical_validators/invalid_data_types.py
|
|
15
|
+
clinical_validators/invalid_patient_ids.py
|
|
16
|
+
clinical_validators/missing_fields.py
|
|
17
|
+
clinical_validators/missing_visit_data.py
|
|
18
|
+
clinical_validators/out_of_range_values.py
|
|
19
|
+
tests/test_age_consistency.py
|
|
20
|
+
tests/test_data_completeness.py
|
|
21
|
+
tests/test_duplicate_records.py
|
|
22
|
+
tests/test_future_dates.py
|
|
23
|
+
tests/test_gender_based_validation.py
|
|
24
|
+
tests/test_invalid_data_types.py
|
|
25
|
+
tests/test_invalid_patient_ids.py
|
|
26
|
+
tests/test_missing_fields.py
|
|
27
|
+
tests/test_missing_visit_data.py
|
|
28
|
+
tests/test_out_of_range_values.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pandas>=1.3.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
clinical_validators
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .missing_fields import validate_missing_critical_fields
|
|
2
|
+
from .invalid_data_types import validate_invalid_data_types
|
|
3
|
+
from .future_dates import validate_future_dates
|
|
4
|
+
from .out_of_range_values import validate_out_of_range_values
|
|
5
|
+
from .duplicate_records import validate_duplicate_records
|
|
6
|
+
from .invalid_patient_ids import validate_patient_ids
|
|
7
|
+
from .missing_visit_data import validate_missing_visit_data
|
|
8
|
+
from .age_consistency import validate_age_consistency
|
|
9
|
+
from .gender_based_validation import validate_gender_based_tests
|
|
10
|
+
from .data_completeness import validate_data_completeness
|
|
11
|
+
|
|
12
|
+
__version__ = "0.4.0"
|
|
13
|
+
__all__ = [
|
|
14
|
+
'validate_missing_critical_fields',
|
|
15
|
+
'validate_invalid_data_types',
|
|
16
|
+
'validate_future_dates',
|
|
17
|
+
'validate_out_of_range_values',
|
|
18
|
+
'validate_duplicate_records',
|
|
19
|
+
'validate_patient_ids',
|
|
20
|
+
'validate_missing_visit_data',
|
|
21
|
+
'validate_age_consistency',
|
|
22
|
+
'validate_gender_based_tests',
|
|
23
|
+
'validate_data_completeness'
|
|
24
|
+
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
def validate_age_consistency(dataframe, age_field='age', birth_date_field='birth_date', tolerance_years=1):
    """
    Validates that the recorded age is consistent with the birth date.

    The age implied by the birth date is computed against the current local
    time as ``days / 365.25`` and compared with the recorded age. Rows whose
    age or birth date is missing, or cannot be parsed, are skipped (type
    problems are reported by the invalid-data-types validator).

    Timezone-aware birth dates are reduced to their wall-clock value before
    the comparison, so they are checked instead of being silently skipped.

    Args:
        dataframe: pandas DataFrame.
        age_field: Column name for age.
        birth_date_field: Column name for birth date.
        tolerance_years: Allowed difference in years (default 1 for leap year/rounding).

    Returns:
        dict with validation results: validator_name, status ('PASS'/'FAIL'),
        total_records, failed_records, failure_count and failures (one entry
        per inconsistent row).

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    failures = []
    failed_row_indices = set()
    now = datetime.now()

    for idx, row in dataframe.iterrows():
        age = row.get(age_field)
        birth_date = row.get(birth_date_field)

        if pd.isna(age) or pd.isna(birth_date):
            continue

        try:
            recorded_age = float(age)
            birth_ts = pd.to_datetime(birth_date)
            if pd.isna(birth_ts):
                # Values such as the literal string 'NaT' parse to NaT.
                continue
            if birth_ts.tzinfo is not None:
                # `now` is naive; subtracting an aware datetime raises
                # TypeError, which used to hide the row from validation.
                birth_ts = birth_ts.tz_localize(None)
            calculated_age = (now - birth_ts.to_pydatetime()).days / 365.25
        except (ValueError, TypeError):
            # Unparseable age / birth date: not this validator's concern.
            continue

        if abs(calculated_age - recorded_age) > tolerance_years:
            failed_row_indices.add(idx)
            patient_id = row.get('patient_id', 'UNKNOWN')
            failures.append({
                'row_index': idx,
                'patient_id': patient_id,
                'recorded_age': age,
                'calculated_age': round(calculated_age, 1),
                'error': 'Age does not match birth date'
            })

    return {
        'validator_name': 'age_consistency',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_data_completeness(dataframe, test_field='test_type', completeness_rules=None):
    """
    Checks that every row carries the fields required by its test type.

    A required field counts as missing when the column does not exist, the
    cell is NaN/None, or the cell is blank after stripping whitespace. Rows
    with no test type, or a test type without a rule, are not checked.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name indicating the test type.
        completeness_rules: Dict mapping test types to lists of required fields.
            Default: {'Blood': ['hemoglobin', 'wbc'], 'Urine': ['ph', 'protein']}

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    rules = completeness_rules
    if rules is None:
        rules = {
            'Blood': ['hemoglobin', 'wbc'],
            'Urine': ['ph', 'protein']
        }

    def _is_missing(record, column):
        # Absent column, null cell, or whitespace-only cell.
        if column not in dataframe.columns:
            return True
        if pd.isna(record.get(column)):
            return True
        return str(record.get(column)).strip() == ''

    problems = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        raw_type = record.get(test_field)
        if pd.isna(raw_type):
            continue

        kind = str(raw_type).strip()
        if kind not in rules:
            continue

        for column in rules[kind]:
            if not _is_missing(record, column):
                continue
            bad_rows.add(label)
            problems.append({
                'row_index': label,
                'patient_id': record.get('patient_id', 'UNKNOWN'),
                'test_type': kind,
                'missing_field': column,
                'error': f"Missing required field '{column}' for {kind} test"
            })

    outcome = 'FAIL' if bad_rows else 'PASS'
    return {
        'validator_name': 'data_completeness',
        'status': outcome,
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_duplicate_records(dataframe, duplicate_fields=None):
    """
    Validates that there are no duplicate records based on specified fields.
    Typical use case: same patient + same test + same date

    Rows are duplicates when they agree on every checked field. Missing
    values (NaN/None) in a key field compare equal to each other, matching
    ``DataFrame.duplicated``; such rows are reported rather than dropped.

    Args:
        dataframe: pandas DataFrame with clinical data
        duplicate_fields: list of field names to check for duplicates
            Default: ['patient_id', 'test_name', 'test_date']
            Fields absent from the DataFrame are ignored.

    Returns:
        dict with validation results. Each failure holds the row index, the
        fields checked, the duplicated values and the group size.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty, or fewer than 2 of the requested
            fields exist in the DataFrame
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if duplicate_fields is None:
        duplicate_fields = ['patient_id', 'test_name', 'test_date']

    # Only fields that actually exist can take part in the comparison.
    available_fields = [f for f in duplicate_fields if f in dataframe.columns]
    if len(available_fields) < 2:
        raise ValueError(f"Need at least 2 fields to check duplicates. Found: {available_fields}")

    duplicates = dataframe[dataframe.duplicated(subset=available_fields, keep=False)]

    failures = []
    if not duplicates.empty:
        # dropna=False keeps groups whose key contains NaN; the default would
        # discard exactly the rows `duplicated` has just flagged.
        for _, group in duplicates.groupby(available_fields, dropna=False):
            key_row = group[available_fields].iloc[0]
            occurrences = len(group)
            for idx in group.index.tolist():
                failures.append({
                    'row_index': idx,
                    'duplicate_fields': available_fields,
                    'duplicate_values': {field: key_row[field] for field in available_fields},
                    'occurrences': occurrences
                })

    failed_row_indices = {f['row_index'] for f in failures}

    return {
        'validator_name': 'duplicate_records',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_future_dates(dataframe, date_fields=None):
    """
    Validates that date fields do not contain dates in the future.

    Timezone-naive values are compared with the current local time.
    Timezone-aware values are compared as instants against the current UTC
    time, so an offset in the data cannot turn a past moment into a
    "future" one (or the reverse). Missing, blank and unparseable values
    are skipped.

    Args:
        dataframe: pandas DataFrame with clinical data.
        date_fields: list of date column names to check.
            Default: ['visit_date', 'test_date']. Columns absent from the
            DataFrame are ignored.

    Returns:
        dict with validation results: validator_name, status, total_records,
        failed_records, failure_count and failures.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if date_fields is None:
        date_fields = ['visit_date', 'test_date']

    failures = []
    failed_row_indices = set()
    # Two references so naive and aware values never get compared together,
    # which would raise TypeError.
    now_naive = pd.Timestamp.now()
    now_utc = pd.Timestamp.now(tz='UTC')
    has_patient_id = 'patient_id' in dataframe.columns

    for field in date_fields:
        if field not in dataframe.columns:
            continue
        for idx, val in dataframe[field].items():
            if pd.isna(val) or str(val).strip() == '':
                continue
            try:
                date_val = pd.to_datetime(val)
                reference = now_naive if date_val.tzinfo is None else now_utc
                is_future = bool(date_val > reference)
            except (ValueError, TypeError):
                # Unparseable dates are reported by the data-type validator.
                continue

            if not is_future:
                continue

            failed_row_indices.add(idx)
            patient_id = dataframe.at[idx, 'patient_id'] if has_patient_id else 'UNKNOWN'
            failures.append({
                'row_index': idx,
                'field': field,
                'invalid_value': str(val),
                'error': 'Date is in the future',
                'patient_id': patient_id
            })

    return {
        'validator_name': 'future_dates',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_gender_based_tests(dataframe, test_field='test_name', gender_field='gender', rules=None):
    """
    Flags rows where a gender-specific test is recorded for another gender.

    Test names are matched exactly after stripping whitespace; genders are
    compared case-insensitively. Rows with a missing test name or gender,
    and tests without a rule, are not checked.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name for test name.
        gender_field: Column name for gender.
        rules: Dict mapping test names to allowed genders.
            Default: {'Prostate_Specific_Antigen': ['M', 'Male'], 'Pap_Smear': ['F', 'Female']}

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if rules is None:
        rules = {
            'Prostate_Specific_Antigen': ['M', 'Male'],
            'Pap_Smear': ['F', 'Female']
        }

    violations = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        raw_test = record.get(test_field)
        raw_gender = record.get(gender_field)
        if pd.isna(raw_test) or pd.isna(raw_gender):
            continue

        test_label = str(raw_test).strip()
        gender_label = str(raw_gender).strip()
        if test_label not in rules:
            continue

        permitted = [entry.lower() for entry in rules[test_label]]
        if gender_label.lower() in permitted:
            continue

        bad_rows.add(label)
        violations.append({
            'row_index': label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'test': test_label,
            'gender': gender_label,
            'error': f"Test {test_label} is invalid for gender {gender_label}"
        })

    result = {'validator_name': 'gender_based_validation'}
    result['status'] = 'FAIL' if bad_rows else 'PASS'
    result['total_records'] = len(dataframe)
    result['failed_records'] = len(bad_rows)
    result['failure_count'] = len(violations)
    result['failures'] = violations
    return result
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_invalid_data_types(dataframe, numeric_fields=None, date_fields=None):
    """
    Reports values that cannot be read as the type their column expects.

    Numeric columns are probed with ``float``; date columns with
    ``pd.to_datetime``. Missing and blank cells are skipped, as are
    configured columns that are absent from the DataFrame.

    Args:
        dataframe: pandas DataFrame with clinical data.
        numeric_fields: columns expected to be numeric. Default: ['lab_value']
        date_fields: columns expected to hold dates.
            Default: ['visit_date', 'test_date']

    Returns:
        dict with validation results; numeric failures are listed before
        date failures.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if numeric_fields is None:
        numeric_fields = ['lab_value']
    if date_fields is None:
        date_fields = ['visit_date', 'test_date']

    # (columns to check, conversion that must succeed, reported type name)
    checks = (
        (numeric_fields, float, 'numeric'),
        (date_fields, pd.to_datetime, 'datetime'),
    )

    problems = []
    bad_rows = set()

    for columns, convert, type_name in checks:
        for column in columns:
            if column not in dataframe.columns:
                continue
            for label, cell in dataframe[column].items():
                if pd.isna(cell) or str(cell).strip() == '':
                    continue
                try:
                    convert(cell)
                except (ValueError, TypeError):
                    bad_rows.add(label)
                    if 'patient_id' in dataframe.columns:
                        owner = dataframe.at[label, 'patient_id']
                    else:
                        owner = 'UNKNOWN'
                    problems.append({
                        'row_index': label,
                        'field': column,
                        'invalid_value': str(cell),
                        'expected_type': type_name,
                        'patient_id': owner
                    })

    return {
        'validator_name': 'invalid_data_types',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
def validate_patient_ids(dataframe, patient_id_field='patient_id', pattern=None, min_length=None, max_length=None):
    """
    Validates patient ID format according to specified rules.

    Args:
        dataframe: pandas DataFrame with clinical data
        patient_id_field: name of the patient ID column (default: 'patient_id')
        pattern: regex pattern for validation (default: None)
        min_length: minimum length of patient ID (default: None)
        max_length: maximum length of patient ID (default: None)

    Default behavior: Validates that patient IDs are exactly 5 digits

    Missing or blank IDs are skipped. Whole-number floats are validated in
    their integer form: pandas stores a numeric ID column as float64 as soon
    as one value is missing, and '12345.0' would otherwise fail every
    digit pattern. The pattern is checked first; length limits are only
    checked when the pattern (if any) matched, so each ID yields at most
    one failure.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty or the ID column does not exist
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if patient_id_field not in dataframe.columns:
        raise ValueError(f"Column '{patient_id_field}' not found in DataFrame")

    # Default: 5-digit numeric ID
    if pattern is None and min_length is None and max_length is None:
        pattern = r'^\d{5}$'  # Exactly 5 digits

    failures = []
    failed_row_indices = set()

    for idx, val in dataframe[patient_id_field].items():
        if pd.isna(val) or str(val).strip() == '':
            continue  # Skip missing values

        if isinstance(val, float) and val.is_integer():
            # 12345.0 is the float64 rendering of ID 12345, not a bad ID.
            val = int(val)

        patient_id_str = str(val).strip()
        error_reason = ""

        if pattern and not re.match(pattern, patient_id_str):
            error_reason = f"Does not match pattern: {pattern}"
        elif min_length is not None and len(patient_id_str) < min_length:
            error_reason = f"Length {len(patient_id_str)} < minimum {min_length}"
        elif max_length is not None and len(patient_id_str) > max_length:
            error_reason = f"Length {len(patient_id_str)} > maximum {max_length}"
        else:
            continue

        failed_row_indices.add(idx)
        failures.append({
            'row_index': idx,
            'field': patient_id_field,
            'invalid_value': patient_id_str,
            'error': error_reason
        })

    return {
        'validator_name': 'invalid_patient_ids',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
def validate_missing_critical_fields(
    dataframe: pd.DataFrame,
    critical_fields: Optional[List[str]] = None
) -> Dict:
    """
    Validates that critical clinical fields are not NULL/empty.

    A value counts as missing when it is NaN/None/NaT, an empty string, or
    a string containing only whitespace (the same notion of "blank" the
    other validators in this package use).

    Args:
        dataframe: pandas DataFrame with clinical data
        critical_fields: list of required field names. If None, uses default
            clinical fields (patient_id, visit_date, lab_test_name,
            lab_value, test_date)

    Returns:
        Dictionary with validation results containing:
            - validator_name: name of the validator
            - status: 'PASS' or 'FAIL'
            - total_records: total rows in dataframe
            - failed_records: number of rows with missing values
            - critical_fields_checked: the requested field list. Fields
              absent from the DataFrame are skipped by the check but are
              still listed here.
            - failure_count: total number of missing value instances
            - failures: list of dicts with row_index, missing_field and
              patient_id ('UNKNOWN' when unavailable)

    Raises:
        ValueError: if dataframe is empty or critical_fields is empty list
        TypeError: if dataframe is not a pandas DataFrame

    Example:
        import pandas as pd
        from clinical_validators import validate_missing_critical_fields

        df = pd.read_csv('lab_data.csv')
        result = validate_missing_critical_fields(df)
        print(result['status'])            # e.g. FAIL
        print(result['failed_records'])    # e.g. 5
    """

    # ===== INPUT VALIDATION =====
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(dataframe).__name__}")

    if dataframe.empty:
        raise ValueError("DataFrame is empty - cannot validate empty dataset")

    # Default critical fields for clinical data
    if critical_fields is None:
        critical_fields = [
            'patient_id',
            'visit_date',
            'lab_test_name',
            'lab_value',
            'test_date'
        ]

    if isinstance(critical_fields, list) and len(critical_fields) == 0:
        raise ValueError("critical_fields list cannot be empty")

    def _is_blank(value) -> bool:
        # Only strings can be blank; numbers, dates etc. are never blank.
        return isinstance(value, str) and value.strip() == ''

    # ===== VALIDATION LOGIC =====
    row_labels = dataframe.index.tolist()

    # Resolve patient ids once, by position, so duplicate index labels
    # cannot make the per-row lookup ambiguous.
    if 'patient_id' in dataframe.columns:
        pid_column = dataframe['patient_id']
        patient_ids = [
            'UNKNOWN' if (is_na or _is_blank(value)) else str(value)
            for is_na, value in zip(pid_column.isna().tolist(), pid_column.tolist())
        ]
    else:
        patient_ids = ['UNKNOWN'] * len(row_labels)

    failures = []

    for field in critical_fields:
        # Skip if field doesn't exist in dataframe
        if field not in dataframe.columns:
            continue

        column = dataframe[field]
        cells = zip(column.isna().tolist(), column.tolist())
        for position, (is_na, value) in enumerate(cells):
            if not (is_na or _is_blank(value)):
                continue
            failures.append({
                'row_index': row_labels[position],
                'missing_field': field,
                'patient_id': patient_ids[position]
            })

    # Count unique rows with any failures
    failed_row_indices = {f['row_index'] for f in failures}

    # ===== RETURN RESULTS =====
    return {
        'validator_name': 'missing_critical_fields',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'critical_fields_checked': critical_fields,
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_missing_visit_data(dataframe, lab_field='lab_value', visit_field='visit_date'):
    """
    Flags rows that hold a lab result but no visit date.

    A cell counts as present when it is neither NaN/None nor blank after
    stripping whitespace.

    Args:
        dataframe: pandas DataFrame with clinical data.
        lab_field: Column name containing lab results.
        visit_field: Column name containing visit dates.

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    def _is_present(record, column):
        # Non-null and not whitespace-only.
        return pd.notna(record.get(column)) and str(record.get(column)).strip() != ''

    orphaned = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        lab_present = _is_present(record, lab_field)
        visit_present = _is_present(record, visit_field)

        if visit_present or not lab_present:
            continue

        bad_rows.add(label)
        orphaned.append({
            'row_index': label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'error': f"Has lab value '{record[lab_field]}' but missing {visit_field}"
        })

    return {
        'validator_name': 'missing_visit_data',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(orphaned),
        'failures': orphaned
    }
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def validate_out_of_range_values(dataframe, range_rules=None):
    """
    Validates that numeric fields fall within acceptable clinical ranges.

    Missing values (per ``pd.isna``) and values that cannot be converted with
    ``float()`` are skipped rather than reported. Fields named in
    ``range_rules`` that are not columns of the DataFrame are ignored.

    Args:
        dataframe: pandas DataFrame with clinical data
        range_rules: dict with field names as keys and (min, max) tuples as values.
            Bounds are inclusive. Either bound may be None to leave that side
            of the range open.
            Example: {'lab_value': (0, 1000), 'age': (0, 120)}
            Defaults to {'lab_value': (0, 10000), 'age': (0, 120)}.

    Returns:
        dict with validation results:
            'validator_name': always 'out_of_range_values'.
            'status': 'PASS' when no row fails, otherwise 'FAIL'.
            'total_records': number of rows in the DataFrame.
            'failed_records': number of distinct rows with at least one
                out-of-range value.
            'failure_count': number of entries in 'failures' (one per
                offending row/field pair).
            'failures': list of dicts with 'row_index' (index label), 'field',
                'invalid_value' (as float), 'expected_range' ("min-max") and
                'patient_id' ('UNKNOWN' when there is no such column).

    Raises:
        TypeError: if ``dataframe`` is not a pandas DataFrame.
        ValueError: if ``dataframe`` is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    # Default clinical ranges
    if range_rules is None:
        range_rules = {
            'lab_value': (0, 10000),  # Generic lab value range
            'age': (0, 120)  # Human age range
        }

    # Looked up once; None means the frame has no patient_id column.
    patient_ids = dataframe['patient_id'] if 'patient_id' in dataframe.columns else None

    failures = []
    # Rows are tracked by position rather than index label so that a
    # non-unique index neither merges distinct failing rows nor makes the
    # patient_id lookup return several values.
    failed_positions = set()

    for field, (min_val, max_val) in range_rules.items():
        if field not in dataframe.columns:
            continue

        for position, (idx, val) in enumerate(dataframe[field].items()):
            if pd.isna(val):
                continue  # Skip missing values

            try:
                numeric_val = float(val)
                out_of_range = (
                    (min_val is not None and numeric_val < min_val)
                    or (max_val is not None and numeric_val > max_val)
                )
            except (ValueError, TypeError):
                # Skip non-numeric values (handled by invalid_data_types validator)
                continue

            if not out_of_range:
                continue

            failed_positions.add(position)
            failures.append({
                'row_index': idx,
                'field': field,
                'invalid_value': numeric_val,
                'expected_range': f"{min_val}-{max_val}",
                'patient_id': patient_ids.iloc[position] if patient_ids is not None else 'UNKNOWN'
            })

    return {
        'validator_name': 'out_of_range_values',
        'status': 'PASS' if len(failed_positions) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_positions),
        'failure_count': len(failures),
        'failures': failures
    }
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from pathlib import Path

from setuptools import setup, find_packages

# Read README.md from the directory containing this file rather than from the
# current working directory, so the build works no matter where it is invoked.
long_description = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

# Package metadata; the README text doubles as the long description.
setup(
    name="clinical-data-validators",
    version="0.4.0",
    author="Navin Kumar",
    description="A library for validating clinical data quality",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    python_requires=">=3.8",
    install_requires=[
        "pandas>=1.3.0",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
7
|
+
from clinical_validators.age_consistency import validate_age_consistency
|
|
8
|
+
|
|
9
|
+
class TestAgeConsistency:
    """Checks validate_age_consistency with matching, mismatching and missing age data."""

    @staticmethod
    def _birth_date_years_ago(years):
        """Return today's date shifted back by `years` calendar years as 'YYYY-MM-DD'.

        Subtracting ``365 * years`` days ignores leap days and lands roughly a
        week short of `years` full years, so the year is replaced directly.
        """
        today = datetime.now()
        try:
            shifted = today.replace(year=today.year - years)
        except ValueError:
            # Today is Feb 29 and the target year has no Feb 29.
            shifted = today.replace(year=today.year - years, day=28)
        return shifted.strftime('%Y-%m-%d')

    def test_valid_age(self):
        # Stated age equals the age implied by the birth date: expects PASS.
        birth = self._birth_date_years_ago(30)
        df = pd.DataFrame({'patient_id': [1], 'age': [30], 'birth_date': [birth]})
        assert validate_age_consistency(df)['status'] == 'PASS'

    def test_invalid_age(self):
        # Stated age is 20 years off from the birth date: expects FAIL.
        birth = self._birth_date_years_ago(30)
        df = pd.DataFrame({'patient_id': [1], 'age': [50], 'birth_date': [birth]})
        assert validate_age_consistency(df)['status'] == 'FAIL'

    def test_missing_data(self):
        # Both age and birth_date are missing: expects PASS.
        df = pd.DataFrame({'patient_id': [1], 'age': [None], 'birth_date': [None]})
        assert validate_age_consistency(df)['status'] == 'PASS'
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
6
|
+
from clinical_validators.data_completeness import validate_data_completeness
|
|
7
|
+
|
|
8
|
+
class TestDataCompleteness:
    """Checks validate_data_completeness on Blood and Urine rows."""

    def test_complete_blood(self):
        # Blood row with both hemoglobin and wbc filled in.
        record = {'patient_id': [1], 'test_type': ['Blood'], 'hemoglobin': [14], 'wbc': [5]}
        outcome = validate_data_completeness(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'

    def test_incomplete_blood(self):
        # Blood row whose wbc value is missing.
        record = {'patient_id': [1], 'test_type': ['Blood'], 'hemoglobin': [14], 'wbc': [None]}
        outcome = validate_data_completeness(pd.DataFrame(record))
        assert outcome['status'] == 'FAIL'

    def test_different_test_type(self):
        # Urine test requires 'ph' and 'protein'. 'hemoglobin' and 'wbc' are for Blood, so they can be None.
        record = {
            'patient_id': [1],
            'test_type': ['Urine'],
            'ph': [6.0],
            'protein': ['Negative'],
            'hemoglobin': [None],
            'wbc': [None],
        }
        outcome = validate_data_completeness(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
7
|
+
from clinical_validators.duplicate_records import validate_duplicate_records
|
|
8
|
+
|
|
9
|
+
class TestDuplicateRecords:
    """Checks validate_duplicate_records on unique, identical and partially matching rows."""

    def test_no_duplicates(self):
        # Three rows that differ in every column.
        records = {
            'patient_id': [1001, 1002, 1003],
            'test_name': ['Blood', 'Urine', 'X-Ray'],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        }
        outcome = validate_duplicate_records(pd.DataFrame(records))
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_exact_duplicates(self):
        # Same patient, same test, same date on both rows.
        records = {
            'patient_id': [1001, 1001],
            'test_name': ['Blood', 'Blood'],
            'test_date': ['2026-01-15', '2026-01-15'],
        }
        outcome = validate_duplicate_records(pd.DataFrame(records))
        assert outcome['status'] == 'FAIL'
        # Both rows are duplicates
        assert outcome['failed_records'] == 2

    def test_partial_duplicates_not_flagged(self):
        # Same patient and date, but a different test on each row.
        records = {
            'patient_id': [1001, 1001],
            'test_name': ['Blood', 'Urine'],
            'test_date': ['2026-01-15', '2026-01-15'],
        }
        outcome = validate_duplicate_records(pd.DataFrame(records))
        # Not exact duplicates
        assert outcome['status'] == 'PASS'
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
|
|
7
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
8
|
+
from clinical_validators.future_dates import validate_future_dates
|
|
9
|
+
|
|
10
|
+
class TestFutureDates:
    """Checks validate_future_dates with past dates, a future visit date and an empty frame."""

    def test_valid_past_dates(self):
        # Both date columns hold a date in the past.
        frame = pd.DataFrame({'patient_id': [1001], 'visit_date': ['2023-01-15'], 'test_date': ['2023-01-15']})
        outcome = validate_future_dates(frame)
        assert outcome['status'] == 'PASS'

    def test_future_visit_date(self):
        # visit_date is ten days ahead of the current time; test_date stays in the past.
        ten_days_ahead = datetime.now() + timedelta(days=10)
        future_date = ten_days_ahead.strftime('%Y-%m-%d')
        frame = pd.DataFrame({'patient_id': [1001], 'visit_date': [future_date], 'test_date': ['2023-01-15']})
        outcome = validate_future_dates(frame)
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'visit_date'

    def test_empty_dataframe_raises_error(self):
        # An empty DataFrame is expected to raise ValueError.
        with pytest.raises(ValueError):
            validate_future_dates(pd.DataFrame())
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
6
|
+
from clinical_validators.gender_based_validation import validate_gender_based_tests
|
|
7
|
+
|
|
8
|
+
class TestGenderValidation:
    """Checks validate_gender_based_tests for a PSA test on each gender and for a generic test."""

    def test_valid_gender(self):
        # Prostate_Specific_Antigen recorded for gender 'M'.
        record = {'patient_id': [1], 'test_name': ['Prostate_Specific_Antigen'], 'gender': ['M']}
        outcome = validate_gender_based_tests(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'

    def test_invalid_gender(self):
        # Prostate_Specific_Antigen recorded for gender 'F'.
        record = {'patient_id': [1], 'test_name': ['Prostate_Specific_Antigen'], 'gender': ['F']}
        outcome = validate_gender_based_tests(pd.DataFrame(record))
        assert outcome['status'] == 'FAIL'

    def test_unrestricted_test(self):
        # Blood_Test recorded for gender 'F'.
        record = {'patient_id': [1], 'test_name': ['Blood_Test'], 'gender': ['F']}
        outcome = validate_gender_based_tests(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
7
|
+
from clinical_validators.invalid_data_types import validate_invalid_data_types
|
|
8
|
+
|
|
9
|
+
class TestInvalidDataTypes:
    """Checks validate_invalid_data_types with clean data, a bad number, a bad date and an empty frame."""

    def test_valid_data_types(self):
        # Numeric lab value and ISO-formatted dates.
        record = {'patient_id': [1001], 'lab_value': [14.5], 'visit_date': ['2026-01-15'], 'test_date': ['2026-01-15']}
        outcome = validate_invalid_data_types(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'

    def test_invalid_numeric_value(self):
        # lab_value holds the text 'abc'.
        record = {'patient_id': [1001], 'lab_value': ['abc'], 'visit_date': ['2026-01-15'], 'test_date': ['2026-01-15']}
        outcome = validate_invalid_data_types(pd.DataFrame(record))
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'lab_value'

    def test_invalid_date_format(self):
        # visit_date holds the text 'not-a-date'.
        record = {'patient_id': [1001], 'lab_value': [14.5], 'visit_date': ['not-a-date'], 'test_date': ['2026-01-15']}
        outcome = validate_invalid_data_types(pd.DataFrame(record))
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'visit_date'

    def test_empty_dataframe_raises_error(self):
        # An empty DataFrame is expected to raise ValueError.
        with pytest.raises(ValueError):
            validate_invalid_data_types(pd.DataFrame())
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
7
|
+
from clinical_validators.invalid_patient_ids import validate_patient_ids
|
|
8
|
+
|
|
9
|
+
class TestInvalidPatientIds:
    """Checks validate_patient_ids with well-formed, too-short and non-numeric IDs."""

    def test_valid_patient_ids(self):
        # Three five-digit IDs.
        frame = pd.DataFrame({'patient_id': ['12345', '67890', '11111']})
        outcome = validate_patient_ids(frame)
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_invalid_format_too_short(self):
        # 1234 is only 4 digits
        frame = pd.DataFrame({'patient_id': ['12345', '1234']})
        outcome = validate_patient_ids(frame)
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1

    def test_invalid_format_non_numeric(self):
        # ABC12 contains letters
        frame = pd.DataFrame({'patient_id': ['12345', 'ABC12']})
        outcome = validate_patient_ids(frame)
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Add parent directory to path for imports
|
|
7
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
8
|
+
|
|
9
|
+
from clinical_validators.missing_fields import validate_missing_critical_fields
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestMissingCriticalFields:
    """Test suite for missing_critical_fields validator"""

    def test_valid_dataframe_no_missing_values(self):
        """Test: Clean data should pass validation"""
        frame = pd.DataFrame({
            'patient_id': [1001, 1002, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'PASS', "Clean data should pass"
        assert outcome['failed_records'] == 0, "Should have 0 failed records"
        assert outcome['total_records'] == 3, "Should have 3 total records"
        assert len(outcome['failures']) == 0, "Failures list should be empty"
        print("✓ Test 1 PASSED: Clean data validated successfully")

    def test_missing_patient_id(self):
        """Test: Missing patient_id should be detected"""
        frame = pd.DataFrame({
            'patient_id': [1001, None, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'FAIL', "Should fail with missing patient_id"
        assert outcome['failed_records'] == 1, "Should have 1 failed record"
        patient_id_reported = any(
            entry['missing_field'] == 'patient_id' for entry in outcome['failures']
        )
        assert patient_id_reported, "Should identify missing patient_id"
        print("✓ Test 2 PASSED: Missing patient_id detected")

    def test_missing_multiple_fields(self):
        """Test: Multiple missing fields across different rows"""
        frame = pd.DataFrame({
            'patient_id': [1001, 1002, None],
            'visit_date': [None, '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', None, '2026-01-17'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'FAIL', "Should fail with missing fields"
        assert outcome['failed_records'] == 3, "Should have 3 failed records"
        assert outcome['failure_count'] == 3, "Should have 3 total failures"
        print("✓ Test 3 PASSED: Multiple missing fields detected correctly")

    def test_missing_lab_test_name(self):
        """Test: Missing lab_test_name detection (empty string)"""
        frame = pd.DataFrame({
            'patient_id': [1001, 1002],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'lab_test_name': ['Hemoglobin', ''],  # Empty string
            'lab_value': [14.5, 95],
            'test_date': ['2026-01-15', '2026-01-16'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'FAIL', "Should detect empty string as missing"
        assert outcome['failed_records'] == 1, "Should flag empty string row"
        empty_name_reported = any(
            entry['missing_field'] == 'lab_test_name' and entry['row_index'] == 1
            for entry in outcome['failures']
        )
        assert empty_name_reported, "Should identify lab_test_name empty"
        print("✓ Test 4 PASSED: Empty string detected as missing value")

    def test_all_fields_missing(self):
        """Test: Edge case - all critical fields missing in one row"""
        frame = pd.DataFrame({
            'patient_id': [None, 1002],
            'visit_date': [None, '2026-01-16'],
            'lab_test_name': [None, 'Glucose'],
            'lab_value': [None, 95],
            'test_date': [None, '2026-01-16'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['failed_records'] == 1, "Should detect row with all missing fields"
        assert outcome['failure_count'] == 5, "Should count 5 failures (1 per field)"
        print("✓ Test 5 PASSED: All missing fields in single row detected")

    def test_empty_dataframe_raises_error(self):
        """Test: Empty DataFrame should raise ValueError"""
        empty_frame = pd.DataFrame()

        with pytest.raises(ValueError):
            validate_missing_critical_fields(empty_frame)

        print("✓ Test 6 PASSED: Empty DataFrame raises ValueError")

    def test_invalid_input_type_raises_error(self):
        """Test: Non-DataFrame input should raise TypeError"""
        # Test with string input
        with pytest.raises(TypeError):
            validate_missing_critical_fields("not a dataframe")

        # Test with list input
        with pytest.raises(TypeError):
            validate_missing_critical_fields([1, 2, 3])

        print("✓ Test 7 PASSED: Invalid input types raise TypeError")

    def test_custom_critical_fields(self):
        """Test: Validator should work with custom field list"""
        frame = pd.DataFrame({
            'patient_id': [1001, None],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'other_field': ['A', 'B'],
        })
        outcome = validate_missing_critical_fields(
            frame, critical_fields=['patient_id', 'other_field']
        )

        assert 'visit_date' not in outcome['critical_fields_checked'], \
            "Should only check specified fields"
        assert outcome['failed_records'] == 1, "Should detect missing patient_id"
        print("✓ Test 8 PASSED: Custom field list works correctly")

    def test_extra_columns_ignored(self):
        """Test: Extra columns should not affect validation"""
        frame = pd.DataFrame({
            'patient_id': [1001, 1002],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'lab_test_name': ['Hemoglobin', 'Glucose'],
            'lab_value': [14.5, 95],
            'test_date': ['2026-01-15', '2026-01-16'],
            'extra_column_1': ['X', 'Y'],
            'extra_column_2': [100, 200],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'PASS', "Extra columns should not cause failures"
        assert outcome['failed_records'] == 0, "Should have no failures"
        print("✓ Test 9 PASSED: Extra columns ignored correctly")

    def test_nan_vs_none_consistency(self):
        """Test: Both None and NaT are treated as missing"""
        frame = pd.DataFrame({
            'patient_id': [1001, None, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', pd.NaT],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        })
        outcome = validate_missing_critical_fields(frame)

        assert outcome['status'] == 'FAIL', "Should detect both None and NaT"
        assert outcome['failed_records'] == 2, "Should have 2 failed records"
        print("✓ Test 10 PASSED: NaN and None handled consistently")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
if __name__ == '__main__':
    # Print the banner (rule, two title lines, rule, blank line) in one call.
    rule = "=" * 80
    print(
        rule,
        "CLINICAL DATA VALIDATORS - TEST SUITE",
        "Validator: missing_critical_fields",
        rule,
        "",
        sep="\n",
    )

    # Run with pytest
    pytest.main([__file__, '-v', '--tb=short'])
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
6
|
+
from clinical_validators.missing_visit_data import validate_missing_visit_data
|
|
7
|
+
|
|
8
|
+
class TestMissingVisitData:
    """Checks validate_missing_visit_data for lab values with and without a visit date."""

    def test_valid_data(self):
        # Lab value and visit date are both present.
        record = {'patient_id': [1], 'lab_value': [100], 'visit_date': ['2026-01-01']}
        outcome = validate_missing_visit_data(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'

    def test_missing_visit(self):
        # Lab value is present but the visit date is None.
        record = {'patient_id': [1], 'lab_value': [100], 'visit_date': [None]}
        outcome = validate_missing_visit_data(pd.DataFrame(record))
        assert outcome['status'] == 'FAIL'

    def test_no_lab_no_visit(self):
        # Neither a lab value nor a visit date is present.
        record = {'patient_id': [1], 'lab_value': [None], 'visit_date': [None]}
        outcome = validate_missing_visit_data(pd.DataFrame(record))
        assert outcome['status'] == 'PASS'
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pytest
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
7
|
+
from clinical_validators.out_of_range_values import validate_out_of_range_values
|
|
8
|
+
|
|
9
|
+
class TestOutOfRangeValues:
    """Checks validate_out_of_range_values against its default lab_value and age ranges."""

    def test_valid_ranges(self):
        # Every lab_value and age lies inside the default range.
        records = {
            'patient_id': [1001, 1002],
            'lab_value': [150.5, 500],
            'age': [35, 67],
        }
        outcome = validate_out_of_range_values(pd.DataFrame(records))
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_lab_value_too_high(self):
        # 15000 > 10000
        records = {
            'patient_id': [1001, 1002],
            'lab_value': [150.5, 15000],
            'age': [35, 67],
        }
        outcome = validate_out_of_range_values(pd.DataFrame(records))
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'lab_value'

    def test_age_out_of_range(self):
        # 150 > 120
        records = {
            'patient_id': [1001, 1002],
            'lab_value': [150.5, 500],
            'age': [35, 150],
        }
        outcome = validate_out_of_range_values(pd.DataFrame(records))
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'age'
|