clinical-data-validators 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. clinical_data_validators-0.4.0/PKG-INFO +49 -0
  2. clinical_data_validators-0.4.0/README.md +30 -0
  3. clinical_data_validators-0.4.0/clinical_data_validators.egg-info/PKG-INFO +49 -0
  4. clinical_data_validators-0.4.0/clinical_data_validators.egg-info/SOURCES.txt +28 -0
  5. clinical_data_validators-0.4.0/clinical_data_validators.egg-info/dependency_links.txt +1 -0
  6. clinical_data_validators-0.4.0/clinical_data_validators.egg-info/requires.txt +1 -0
  7. clinical_data_validators-0.4.0/clinical_data_validators.egg-info/top_level.txt +1 -0
  8. clinical_data_validators-0.4.0/clinical_validators/__init__.py +24 -0
  9. clinical_data_validators-0.4.0/clinical_validators/age_consistency.py +56 -0
  10. clinical_data_validators-0.4.0/clinical_validators/data_completeness.py +57 -0
  11. clinical_data_validators-0.4.0/clinical_validators/duplicate_records.py +62 -0
  12. clinical_data_validators-0.4.0/clinical_validators/future_dates.py +37 -0
  13. clinical_data_validators-0.4.0/clinical_validators/gender_based_validation.py +61 -0
  14. clinical_data_validators-0.4.0/clinical_validators/invalid_data_types.py +46 -0
  15. clinical_data_validators-0.4.0/clinical_validators/invalid_patient_ids.py +74 -0
  16. clinical_data_validators-0.4.0/clinical_validators/missing_fields.py +113 -0
  17. clinical_data_validators-0.4.0/clinical_validators/missing_visit_data.py +43 -0
  18. clinical_data_validators-0.4.0/clinical_validators/out_of_range_values.py +61 -0
  19. clinical_data_validators-0.4.0/setup.cfg +4 -0
  20. clinical_data_validators-0.4.0/setup.py +23 -0
  21. clinical_data_validators-0.4.0/tests/test_age_consistency.py +20 -0
  22. clinical_data_validators-0.4.0/tests/test_data_completeness.py +27 -0
  23. clinical_data_validators-0.4.0/tests/test_duplicate_records.py +37 -0
  24. clinical_data_validators-0.4.0/tests/test_future_dates.py +23 -0
  25. clinical_data_validators-0.4.0/tests/test_gender_based_validation.py +17 -0
  26. clinical_data_validators-0.4.0/tests/test_invalid_data_types.py +27 -0
  27. clinical_data_validators-0.4.0/tests/test_invalid_patient_ids.py +32 -0
  28. clinical_data_validators-0.4.0/tests/test_missing_fields.py +182 -0
  29. clinical_data_validators-0.4.0/tests/test_missing_visit_data.py +17 -0
  30. clinical_data_validators-0.4.0/tests/test_out_of_range_values.py +40 -0
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: clinical-data-validators
3
+ Version: 0.4.0
4
+ Summary: A library for validating clinical data quality
5
+ Author: Navin Kumar
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: pandas>=1.3.0
12
+ Dynamic: author
13
+ Dynamic: classifier
14
+ Dynamic: description
15
+ Dynamic: description-content-type
16
+ Dynamic: requires-dist
17
+ Dynamic: requires-python
18
+ Dynamic: summary
19
+
20
+ # Clinical Data Validators
21
+
22
+ A Python library for validating clinical datasets.
23
+
24
+ ## Installation
25
+
26
+ pip install clinical-data-validators
27
+
28
+ ## Usage
29
+
30
+ ```python
31
+ import pandas as pd
32
+ from clinical_validators import validate_missing_critical_fields
33
+
34
+ df = pd.read_csv("data.csv")
35
+ result = validate_missing_critical_fields(df)
36
+ print(result)
37
+ ```
38
+
39
+ ## Validators
40
+ 1. Missing Critical Fields
41
+ 2. Invalid Data Types
42
+ 3. Future Dates
43
+ 4. Out of Range Values
44
+ 5. Duplicate Records
45
+ 6. Invalid Patient IDs
46
+ 7. Missing Visit Data
47
+ 8. Age Consistency
48
+ 9. Gender-Based Validation
49
+ 10. Data Completeness
@@ -0,0 +1,30 @@
1
+ # Clinical Data Validators
2
+
3
+ A Python library for validating clinical datasets.
4
+
5
+ ## Installation
6
+
7
+ pip install clinical-data-validators
8
+
9
+ ## Usage
10
+
11
+ ```python
12
+ import pandas as pd
13
+ from clinical_validators import validate_missing_critical_fields
14
+
15
+ df = pd.read_csv("data.csv")
16
+ result = validate_missing_critical_fields(df)
17
+ print(result)
18
+ ```
19
+
20
+ ## Validators
21
+ 1. Missing Critical Fields
22
+ 2. Invalid Data Types
23
+ 3. Future Dates
24
+ 4. Out of Range Values
25
+ 5. Duplicate Records
26
+ 6. Invalid Patient IDs
27
+ 7. Missing Visit Data
28
+ 8. Age Consistency
29
+ 9. Gender-Based Validation
30
+ 10. Data Completeness
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: clinical-data-validators
3
+ Version: 0.4.0
4
+ Summary: A library for validating clinical data quality
5
+ Author: Navin Kumar
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: pandas>=1.3.0
12
+ Dynamic: author
13
+ Dynamic: classifier
14
+ Dynamic: description
15
+ Dynamic: description-content-type
16
+ Dynamic: requires-dist
17
+ Dynamic: requires-python
18
+ Dynamic: summary
19
+
20
+ # Clinical Data Validators
21
+
22
+ A Python library for validating clinical datasets.
23
+
24
+ ## Installation
25
+
26
+ pip install clinical-data-validators
27
+
28
+ ## Usage
29
+
30
+ ```python
31
+ import pandas as pd
32
+ from clinical_validators import validate_missing_critical_fields
33
+
34
+ df = pd.read_csv("data.csv")
35
+ result = validate_missing_critical_fields(df)
36
+ print(result)
37
+ ```
38
+
39
+ ## Validators
40
+ 1. Missing Critical Fields
41
+ 2. Invalid Data Types
42
+ 3. Future Dates
43
+ 4. Out of Range Values
44
+ 5. Duplicate Records
45
+ 6. Invalid Patient IDs
46
+ 7. Missing Visit Data
47
+ 8. Age Consistency
48
+ 9. Gender-Based Validation
49
+ 10. Data Completeness
@@ -0,0 +1,28 @@
1
+ README.md
2
+ setup.py
3
+ clinical_data_validators.egg-info/PKG-INFO
4
+ clinical_data_validators.egg-info/SOURCES.txt
5
+ clinical_data_validators.egg-info/dependency_links.txt
6
+ clinical_data_validators.egg-info/requires.txt
7
+ clinical_data_validators.egg-info/top_level.txt
8
+ clinical_validators/__init__.py
9
+ clinical_validators/age_consistency.py
10
+ clinical_validators/data_completeness.py
11
+ clinical_validators/duplicate_records.py
12
+ clinical_validators/future_dates.py
13
+ clinical_validators/gender_based_validation.py
14
+ clinical_validators/invalid_data_types.py
15
+ clinical_validators/invalid_patient_ids.py
16
+ clinical_validators/missing_fields.py
17
+ clinical_validators/missing_visit_data.py
18
+ clinical_validators/out_of_range_values.py
19
+ tests/test_age_consistency.py
20
+ tests/test_data_completeness.py
21
+ tests/test_duplicate_records.py
22
+ tests/test_future_dates.py
23
+ tests/test_gender_based_validation.py
24
+ tests/test_invalid_data_types.py
25
+ tests/test_invalid_patient_ids.py
26
+ tests/test_missing_fields.py
27
+ tests/test_missing_visit_data.py
28
+ tests/test_out_of_range_values.py
@@ -0,0 +1,24 @@
1
+ from .missing_fields import validate_missing_critical_fields
2
+ from .invalid_data_types import validate_invalid_data_types
3
+ from .future_dates import validate_future_dates
4
+ from .out_of_range_values import validate_out_of_range_values
5
+ from .duplicate_records import validate_duplicate_records
6
+ from .invalid_patient_ids import validate_patient_ids
7
+ from .missing_visit_data import validate_missing_visit_data
8
+ from .age_consistency import validate_age_consistency
9
+ from .gender_based_validation import validate_gender_based_tests
10
+ from .data_completeness import validate_data_completeness
11
+
12
# Package version string.
__version__ = "0.4.0"
# Public API of the package: the validator functions re-exported by the
# relative imports at the top of this module.
__all__ = [
    'validate_missing_critical_fields',
    'validate_invalid_data_types',
    'validate_future_dates',
    'validate_out_of_range_values',
    'validate_duplicate_records',
    'validate_patient_ids',
    'validate_missing_visit_data',
    'validate_age_consistency',
    'validate_gender_based_tests',
    'validate_data_completeness'
]
@@ -0,0 +1,56 @@
1
+ import pandas as pd
2
+ from datetime import datetime
3
+
4
def validate_age_consistency(dataframe, age_field='age', birth_date_field='birth_date', tolerance_years=1):
    """
    Validates that the recorded age is consistent with the birth date.

    The age implied by the birth date is computed as elapsed days / 365.25
    relative to the current local time and compared with the recorded age.
    Rows where either value is missing, the birth date cannot be parsed, or
    the age is not numeric are skipped rather than reported.

    Args:
        dataframe: pandas DataFrame.
        age_field: Column name for age.
        birth_date_field: Column name for birth date.
        tolerance_years: Allowed difference in years (default 1 for leap year/rounding).

    Returns:
        dict with validation results (validator_name, status, total_records,
        failed_records, failure_count, failures).

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    failures = []
    failed_row_indices = set()
    now = pd.Timestamp.now()

    for idx, row in dataframe.iterrows():
        age = row.get(age_field)
        birth_date = row.get(birth_date_field)

        if pd.isna(age) or pd.isna(birth_date):
            continue

        try:
            # Drop any timezone so the subtraction below never mixes aware and
            # naive values; previously timezone-aware birth dates raised a
            # TypeError that was swallowed, so those rows were never checked.
            birth_ts = pd.to_datetime(birth_date).tz_localize(None)
            recorded_age = float(age)
        except (ValueError, TypeError):
            # Unparseable date or non-numeric age: not this validator's concern.
            continue

        if pd.isna(birth_ts):
            continue

        calculated_age = (now - birth_ts).days / 365.25
        if abs(calculated_age - recorded_age) > tolerance_years:
            failed_row_indices.add(idx)
            patient_id = row.get('patient_id', 'UNKNOWN')
            failures.append({
                'row_index': idx,
                'patient_id': patient_id,
                'recorded_age': age,
                'calculated_age': round(calculated_age, 1),
                'error': 'Age does not match birth date'
            })

    return {
        'validator_name': 'age_consistency',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,57 @@
1
+ import pandas as pd
2
+
3
def validate_data_completeness(dataframe, test_field='test_type', completeness_rules=None):
    """
    Check that each row carries the fields its test type requires.

    A required field counts as missing when its column is absent from the
    DataFrame, or when the cell is NaN/None or blank after stripping.
    Rows whose test type is missing or has no rule are ignored.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name indicating the test type.
        completeness_rules: Dict mapping test types to lists of required fields.
            Default: {'Blood': ['hemoglobin', 'wbc'], 'Urine': ['ph', 'protein']}

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    rules = completeness_rules
    if rules is None:
        rules = {
            'Blood': ['hemoglobin', 'wbc'],
            'Urine': ['ph', 'protein']
        }

    issues = []
    bad_rows = set()

    for row_label, record in dataframe.iterrows():
        raw_type = record.get(test_field)
        if pd.isna(raw_type):
            continue

        type_name = str(raw_type).strip()
        if type_name not in rules:
            continue

        for required in rules[type_name]:
            if required in dataframe.columns:
                cell = record.get(required)
                # Present and non-blank: requirement satisfied.
                if not (pd.isna(cell) or str(cell).strip() == ''):
                    continue

            bad_rows.add(row_label)
            issues.append({
                'row_index': row_label,
                'patient_id': record.get('patient_id', 'UNKNOWN'),
                'test_type': type_name,
                'missing_field': required,
                'error': f"Missing required field '{required}' for {type_name} test"
            })

    return {
        'validator_name': 'data_completeness',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(issues),
        'failures': issues
    }
@@ -0,0 +1,62 @@
1
+ import pandas as pd
2
+
3
def validate_duplicate_records(dataframe, duplicate_fields=None):
    """
    Validates that there are no duplicate records based on specified fields.
    Typical use case: same patient + same test + same date

    Every row belonging to a duplicated key combination is reported (not just
    the second and later occurrences). Groups are formed with a default
    pandas groupby, so rows whose key contains a missing value are not
    reported.

    Args:
        dataframe: pandas DataFrame with clinical data
        duplicate_fields: list of field names to check for duplicates
            Default: ['patient_id', 'test_name', 'test_date']
            Fields that are not columns of the DataFrame are ignored.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty, or fewer than 2 of the requested
            fields exist in the DataFrame
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if duplicate_fields is None:
        duplicate_fields = ['patient_id', 'test_name', 'test_date']

    # Only compare on the requested fields that actually exist.
    available_fields = [f for f in duplicate_fields if f in dataframe.columns]
    if len(available_fields) < 2:
        raise ValueError(f"Need at least 2 fields to check duplicates. Found: {available_fields}")

    # keep=False marks every member of a duplicated group.
    duplicates = dataframe[dataframe.duplicated(subset=available_fields, keep=False)]

    failures = []
    if not duplicates.empty:
        # Iterating the groupby yields each key together with its rows, so
        # there is no need to re-scan all duplicates once per group. Taking
        # the key from the groupby also keeps each value in its column's own
        # dtype (a row-wise pass over a summary frame would turn integer IDs
        # into floats whenever another key column is float).
        for key, group in duplicates.groupby(available_fields):
            occurrences = len(group)
            if occurrences < 2:
                continue
            for idx in group.index.tolist():
                failures.append({
                    'row_index': idx,
                    'duplicate_fields': available_fields,
                    'duplicate_values': dict(zip(available_fields, key)),
                    'occurrences': occurrences
                })

    failed_row_indices = {f['row_index'] for f in failures}

    return {
        'validator_name': 'duplicate_records',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,37 @@
1
+ import pandas as pd
2
+
3
def validate_future_dates(dataframe, date_fields=None):
    """
    Flag date values that lie after the current local time.

    Args:
        dataframe: pandas DataFrame.
        date_fields: Column names to check. Default: ['visit_date', 'test_date'].
            Columns that are not present in the DataFrame are ignored.

    Returns:
        dict with validation results. Missing/blank values and values that
        cannot be parsed as dates are skipped, not reported.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    columns_to_check = ['visit_date', 'test_date'] if date_fields is None else date_fields

    found = []
    flagged_rows = set()
    current_time = pd.Timestamp.now()

    for column in columns_to_check:
        if column not in dataframe.columns:
            continue

        for row_label, raw in dataframe[column].items():
            if pd.isna(raw) or str(raw).strip() == '':
                continue

            try:
                # Strip timezone info so aware and naive values can both be
                # compared against the naive current time.
                parsed = pd.to_datetime(raw).tz_localize(None)
                if parsed > current_time:
                    flagged_rows.add(row_label)
                    if 'patient_id' in dataframe.columns:
                        owner = dataframe.at[row_label, 'patient_id']
                    else:
                        owner = 'UNKNOWN'
                    found.append({
                        'row_index': row_label,
                        'field': column,
                        'invalid_value': str(raw),
                        'error': 'Date is in the future',
                        'patient_id': owner,
                    })
            except (ValueError, TypeError):
                continue

    return {
        'validator_name': 'future_dates',
        'status': 'FAIL' if flagged_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(flagged_rows),
        'failure_count': len(found),
        'failures': found
    }
@@ -0,0 +1,61 @@
1
+ import pandas as pd
2
+
3
def validate_gender_based_tests(dataframe, test_field='test_name', gender_field='gender', rules=None):
    """
    Report rows where a gender-restricted test is recorded for a gender
    outside that test's allowed list.

    Test names are matched exactly (after stripping whitespace); genders are
    compared case-insensitively. Rows with a missing test name or gender,
    and tests with no rule, are ignored.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name for test name.
        gender_field: Column name for gender.
        rules: Dict mapping test names to allowed genders.
            Default: {'Prostate_Specific_Antigen': ['M', 'Male'], 'Pap_Smear': ['F', 'Female']}

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    active_rules = rules
    if active_rules is None:
        active_rules = {
            'Prostate_Specific_Antigen': ['M', 'Male'],
            'Pap_Smear': ['F', 'Female']
        }

    violations = []
    flagged_rows = set()

    for row_label, record in dataframe.iterrows():
        raw_test = record.get(test_field)
        raw_gender = record.get(gender_field)
        if pd.isna(raw_test) or pd.isna(raw_gender):
            continue

        test_label = str(raw_test).strip()
        if test_label not in active_rules:
            continue

        gender_label = str(raw_gender).strip()
        permitted = {entry.lower() for entry in active_rules[test_label]}
        if gender_label.lower() in permitted:
            continue

        flagged_rows.add(row_label)
        violations.append({
            'row_index': row_label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'test': test_label,
            'gender': gender_label,
            'error': f"Test {test_label} is invalid for gender {gender_label}"
        })

    return {
        'validator_name': 'gender_based_validation',
        'status': 'FAIL' if flagged_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(flagged_rows),
        'failure_count': len(violations),
        'failures': violations
    }
@@ -0,0 +1,46 @@
1
+ import pandas as pd
2
+
3
def validate_invalid_data_types(dataframe, numeric_fields=None, date_fields=None):
    """
    Report values that cannot be converted to their expected type.

    Numeric fields are tested with float(); date fields with pd.to_datetime().
    Missing or blank values are skipped, as are fields that are not columns
    of the DataFrame.

    Args:
        dataframe: pandas DataFrame.
        numeric_fields: Columns expected to be numeric. Default: ['lab_value'].
        date_fields: Columns expected to be dates.
            Default: ['visit_date', 'test_date'].

    Returns:
        dict with validation results; numeric failures are listed before
        date failures.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if numeric_fields is None:
        numeric_fields = ['lab_value']
    if date_fields is None:
        date_fields = ['visit_date', 'test_date']

    # (columns, conversion to attempt, label reported on failure)
    checks = (
        (numeric_fields, float, 'numeric'),
        (date_fields, pd.to_datetime, 'datetime'),
    )

    problems = []
    flagged_rows = set()

    for columns, convert, type_label in checks:
        for column in columns:
            if column not in dataframe.columns:
                continue
            for row_label, cell in dataframe[column].items():
                if pd.isna(cell) or str(cell).strip() == '':
                    continue
                try:
                    convert(cell)
                except (ValueError, TypeError):
                    flagged_rows.add(row_label)
                    if 'patient_id' in dataframe.columns:
                        owner = dataframe.at[row_label, 'patient_id']
                    else:
                        owner = 'UNKNOWN'
                    problems.append({
                        'row_index': row_label,
                        'field': column,
                        'invalid_value': str(cell),
                        'expected_type': type_label,
                        'patient_id': owner,
                    })

    return {
        'validator_name': 'invalid_data_types',
        'status': 'FAIL' if flagged_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(flagged_rows),
        'failure_count': len(problems),
        'failures': problems
    }
@@ -0,0 +1,74 @@
1
+ import pandas as pd
2
+ import re
3
+
4
def validate_patient_ids(dataframe, patient_id_field='patient_id', pattern=None, min_length=None, max_length=None):
    """
    Validates patient ID format according to specified rules.

    Args:
        dataframe: pandas DataFrame with clinical data
        patient_id_field: name of the patient ID column (default: 'patient_id')
        pattern: regex pattern for validation (default: None); applied with
            match semantics, i.e. anchored at the start of the ID
        min_length: minimum length of patient ID (default: None)
        max_length: maximum length of patient ID (default: None)

    Default behavior: Validates that patient IDs are exactly 5 digits

    Missing or blank IDs are skipped. Length limits are only checked when the
    pattern check (if any) passes, so each ID yields at most one failure.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty or the ID column does not exist
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if patient_id_field not in dataframe.columns:
        raise ValueError(f"Column '{patient_id_field}' not found in DataFrame")

    # Default: 5-digit numeric ID
    if pattern is None and min_length is None and max_length is None:
        pattern = r'^\d{5}$'  # Exactly 5 digits

    # Compile once instead of going through re.match for every row.
    matcher = re.compile(pattern) if pattern else None

    failures = []
    failed_row_indices = set()

    for idx, val in dataframe[patient_id_field].items():
        if pd.isna(val) or str(val).strip() == '':
            continue  # Skip missing values

        # A numeric ID column that contains a missing value is stored as
        # float64, so 12345 would render as '12345.0' and fail every rule.
        # Render whole-number floats as integers.
        if isinstance(val, float) and val.is_integer():
            val = int(val)

        patient_id_str = str(val).strip()
        id_length = len(patient_id_str)
        error_reason = ""

        if matcher is not None and not matcher.match(patient_id_str):
            error_reason = f"Does not match pattern: {pattern}"
        elif min_length is not None and id_length < min_length:
            error_reason = f"Length {id_length} < minimum {min_length}"
        elif max_length is not None and id_length > max_length:
            error_reason = f"Length {id_length} > maximum {max_length}"

        if error_reason:
            failed_row_indices.add(idx)
            failures.append({
                'row_index': idx,
                'field': patient_id_field,
                'invalid_value': patient_id_str,
                'error': error_reason
            })

    return {
        'validator_name': 'invalid_patient_ids',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,113 @@
1
+ import pandas as pd
2
+ from typing import Dict, List, Optional
3
+
4
def validate_missing_critical_fields(
    dataframe: pd.DataFrame,
    critical_fields: Optional[List[str]] = None
) -> Dict:
    """
    Check that critical clinical fields contain no NULL/empty values.

    A value counts as missing when it is NaN/None/NaT or exactly the empty
    string. Fields that are not columns of the DataFrame are skipped.

    Args:
        dataframe: pandas DataFrame with clinical data
        critical_fields: list of required field names. If None, uses default
            clinical fields (patient_id, visit_date, lab_test_name,
            lab_value, test_date)

    Returns:
        Dictionary with validation results containing:
            - validator_name: name of the validator
            - status: 'PASS' or 'FAIL'
            - total_records: total rows in dataframe
            - failed_records: number of rows with missing values
            - critical_fields_checked: the field list that was requested
            - failure_count: total number of missing value instances
            - failures: one entry per missing value (row_index,
              missing_field, patient_id)

    Raises:
        ValueError: if dataframe is empty or critical_fields is empty list
        TypeError: if dataframe is not a pandas DataFrame

    Example:
        >>> import pandas as pd
        >>> from clinical_validators import validate_missing_critical_fields
        >>> df = pd.read_csv('lab_data.csv')
        >>> result = validate_missing_critical_fields(df)
        >>> print(result['status'])
        'FAIL'
        >>> print(f"Failed records: {result['failed_records']}")
        'Failed records: 5'
    """
    # --- argument checks ---
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(dataframe).__name__}")
    if dataframe.empty:
        raise ValueError("DataFrame is empty - cannot validate empty dataset")

    if critical_fields is None:
        critical_fields = [
            'patient_id',
            'visit_date',
            'lab_test_name',
            'lab_value',
            'test_date'
        ]
    if isinstance(critical_fields, list) and not critical_fields:
        raise ValueError("critical_fields list cannot be empty")

    def patient_label(row_label):
        """Return the row's patient_id as text, or 'UNKNOWN' if unavailable."""
        try:
            if 'patient_id' not in dataframe.columns:
                return 'UNKNOWN'
            raw = dataframe.at[row_label, 'patient_id']
            if pd.isna(raw) or raw == '':
                return 'UNKNOWN'
            return str(raw)
        except (KeyError, IndexError, TypeError):
            return 'UNKNOWN'

    # --- scan each requested column ---
    problems = []
    for column in critical_fields:
        if column not in dataframe.columns:
            continue

        is_missing = (dataframe[column].isna()) | (dataframe[column] == '')
        for row_label in dataframe[is_missing].index.tolist():
            problems.append({
                'row_index': row_label,
                'missing_field': column,
                'patient_id': patient_label(row_label)
            })

    # A row missing several fields is counted once here.
    affected_rows = {entry['row_index'] for entry in problems}

    return {
        'validator_name': 'missing_critical_fields',
        'status': 'FAIL' if affected_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(affected_rows),
        'critical_fields_checked': critical_fields,
        'failure_count': len(problems),
        'failures': problems
    }
@@ -0,0 +1,43 @@
1
+ import pandas as pd
2
+
3
def validate_missing_visit_data(dataframe, lab_field='lab_value', visit_field='visit_date'):
    """
    Report rows that carry a lab result but no visit date.

    A value counts as present when it is not NaN/None and is not blank after
    stripping whitespace.

    Args:
        dataframe: pandas DataFrame with clinical data.
        lab_field: Column name containing lab results.
        visit_field: Column name containing visit dates.

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    def is_present(value):
        """True when the cell holds a non-missing, non-blank value."""
        return pd.notna(value) and str(value).strip() != ''

    orphans = []
    flagged_rows = set()

    for row_label, record in dataframe.iterrows():
        lab_result = record.get(lab_field)
        if not is_present(lab_result):
            continue
        if is_present(record.get(visit_field)):
            continue

        flagged_rows.add(row_label)
        orphans.append({
            'row_index': row_label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'error': f"Has lab value '{lab_result}' but missing {visit_field}"
        })

    return {
        'validator_name': 'missing_visit_data',
        'status': 'FAIL' if flagged_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(flagged_rows),
        'failure_count': len(orphans),
        'failures': orphans
    }
@@ -0,0 +1,61 @@
1
+ import pandas as pd
2
+
3
def validate_out_of_range_values(dataframe, range_rules=None):
    """
    Validates that numeric fields fall within acceptable clinical ranges.

    Args:
        dataframe: pandas DataFrame with clinical data
        range_rules: dict with field names as keys and (min, max) tuples as values
            Example: {'lab_value': (0, 1000), 'age': (0, 120)}
            Either bound may be None to leave that side unbounded,
            e.g. {'age': (None, 120)}.
            Default: {'lab_value': (0, 10000), 'age': (0, 120)}

    Returns:
        dict with validation results. Bounds are inclusive. Missing values
        and values that cannot be converted to float are skipped; fields that
        are not columns of the DataFrame are ignored.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    # Default clinical ranges
    if range_rules is None:
        range_rules = {
            'lab_value': (0, 10000),  # Generic lab value range
            'age': (0, 120)           # Human age range
        }

    failures = []
    failed_row_indices = set()
    has_patient_id = 'patient_id' in dataframe.columns

    for field, (min_val, max_val) in range_rules.items():
        if field not in dataframe.columns:
            continue

        for idx, val in dataframe[field].items():
            if pd.isna(val):
                continue  # Skip missing values

            try:
                numeric_val = float(val)
                # A None bound means "no limit on this side". Comparing
                # against None used to raise TypeError, which was swallowed
                # below and silently disabled the whole rule.
                too_low = min_val is not None and numeric_val < min_val
                too_high = max_val is not None and numeric_val > max_val
            except (ValueError, TypeError):
                # Skip non-numeric values (handled by invalid_data_types validator)
                continue

            if not (too_low or too_high):
                continue

            failed_row_indices.add(idx)
            patient_id = dataframe.at[idx, 'patient_id'] if has_patient_id else 'UNKNOWN'
            failures.append({
                'row_index': idx,
                'field': field,
                'invalid_value': numeric_val,
                'expected_range': f"{min_val}-{max_val}",
                'patient_id': patient_id
            })

    return {
        'validator_name': 'out_of_range_values',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,23 @@
1
+ from setuptools import setup, find_packages
2
+
3
# The README doubles as the long description in the package metadata.
with open("README.md", "r", encoding="utf-8") as readme_file:
    readme_text = readme_file.read()

# Runtime dependencies installed alongside the package.
runtime_requirements = [
    "pandas>=1.3.0",
]

# Trove classifiers recorded in the package metadata.
trove_classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

setup(
    name="clinical-data-validators",
    version="0.4.0",
    author="Navin Kumar",
    description="A library for validating clinical data quality",
    long_description=readme_text,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    python_requires=">=3.8",
    install_requires=runtime_requirements,
    classifiers=trove_classifiers,
)
@@ -0,0 +1,20 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+ from clinical_validators.age_consistency import validate_age_consistency
8
+
9
class TestAgeConsistency:
    """Checks validate_age_consistency against a birth date 30*365 days ago."""

    @staticmethod
    def _birth_date_thirty_years_back():
        """Return an ISO date string 30*365 days before now."""
        return (datetime.now() - timedelta(days=30*365)).strftime('%Y-%m-%d')

    def test_valid_age(self):
        # Recorded age agrees with the birth date.
        frame = pd.DataFrame({
            'patient_id': [1],
            'age': [30],
            'birth_date': [self._birth_date_thirty_years_back()],
        })
        outcome = validate_age_consistency(frame)
        assert outcome['status'] == 'PASS'

    def test_invalid_age(self):
        # Recorded age is 20 years off.
        frame = pd.DataFrame({
            'patient_id': [1],
            'age': [50],
            'birth_date': [self._birth_date_thirty_years_back()],
        })
        outcome = validate_age_consistency(frame)
        assert outcome['status'] == 'FAIL'

    def test_missing_data(self):
        # Rows with no age or birth date are skipped, so nothing fails.
        frame = pd.DataFrame({'patient_id': [1], 'age': [None], 'birth_date': [None]})
        outcome = validate_age_consistency(frame)
        assert outcome['status'] == 'PASS'
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+ sys.path.insert(0, str(Path(__file__).parent.parent))
6
+ from clinical_validators.data_completeness import validate_data_completeness
7
+
8
class TestDataCompleteness:
    """Checks validate_data_completeness with its default Blood/Urine rules."""

    def test_complete_blood(self):
        # Both Blood fields are filled in.
        frame = pd.DataFrame({
            'patient_id': [1],
            'test_type': ['Blood'],
            'hemoglobin': [14],
            'wbc': [5],
        })
        outcome = validate_data_completeness(frame)
        assert outcome['status'] == 'PASS'

    def test_incomplete_blood(self):
        # 'wbc' is required for Blood but is missing.
        frame = pd.DataFrame({
            'patient_id': [1],
            'test_type': ['Blood'],
            'hemoglobin': [14],
            'wbc': [None],
        })
        outcome = validate_data_completeness(frame)
        assert outcome['status'] == 'FAIL'

    def test_different_test_type(self):
        # Urine test requires 'ph' and 'protein'. 'hemoglobin' and 'wbc' are
        # for Blood, so they can be None.
        frame = pd.DataFrame({
            'patient_id': [1],
            'test_type': ['Urine'],
            'ph': [6.0],
            'protein': ['Negative'],
            'hemoglobin': [None],
            'wbc': [None],
        })
        outcome = validate_data_completeness(frame)
        assert outcome['status'] == 'PASS'
@@ -0,0 +1,37 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+ from clinical_validators.duplicate_records import validate_duplicate_records
8
+
9
class TestDuplicateRecords:
    """Checks for ``validate_duplicate_records`` on patient/test/date rows."""

    @staticmethod
    def _run(patient_ids, test_names, test_dates):
        """Build the three-column frame and return the validator's result."""
        frame = pd.DataFrame({
            'patient_id': patient_ids,
            'test_name': test_names,
            'test_date': test_dates,
        })
        return validate_duplicate_records(frame)

    def test_no_duplicates(self):
        """Three fully distinct rows are expected to pass with no failures."""
        outcome = self._run(
            [1001, 1002, 1003],
            ['Blood', 'Urine', 'X-Ray'],
            ['2026-01-15', '2026-01-16', '2026-01-17'],
        )
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_exact_duplicates(self):
        """Two rows sharing patient, test and date are both expected to be flagged."""
        outcome = self._run(
            [1001, 1001],
            ['Blood', 'Blood'],
            ['2026-01-15', '2026-01-15'],
        )
        assert outcome['status'] == 'FAIL'
        # Each member of the duplicated pair counts as a failed record.
        assert outcome['failed_records'] == 2

    def test_partial_duplicates_not_flagged(self):
        """Same patient and date but a different test is not a duplicate."""
        outcome = self._run(
            [1001, 1001],
            ['Blood', 'Urine'],
            ['2026-01-15', '2026-01-15'],
        )
        assert outcome['status'] == 'PASS'
@@ -0,0 +1,23 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+
7
+ sys.path.insert(0, str(Path(__file__).parent.parent))
8
+ from clinical_validators.future_dates import validate_future_dates
9
+
10
class TestFutureDates:
    """Checks for ``validate_future_dates`` on visit and test dates."""

    def test_valid_past_dates(self):
        """Dates in 2023 are expected to pass."""
        past_day = '2023-01-15'
        frame = pd.DataFrame(
            {'patient_id': [1001], 'visit_date': [past_day], 'test_date': [past_day]}
        )
        assert validate_future_dates(frame)['status'] == 'PASS'

    def test_future_visit_date(self):
        """A visit date ten days ahead of now is expected to be reported."""
        ten_days_ahead = datetime.now() + timedelta(days=10)
        frame = pd.DataFrame({
            'patient_id': [1001],
            'visit_date': [ten_days_ahead.strftime('%Y-%m-%d')],
            'test_date': ['2023-01-15'],
        })
        outcome = validate_future_dates(frame)
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'visit_date'

    def test_empty_dataframe_raises_error(self):
        """An empty frame is expected to be rejected with ValueError."""
        empty = pd.DataFrame()
        with pytest.raises(ValueError):
            validate_future_dates(empty)
@@ -0,0 +1,17 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+ sys.path.insert(0, str(Path(__file__).parent.parent))
6
+ from clinical_validators.gender_based_validation import validate_gender_based_tests
7
+
8
class TestGenderValidation:
    """Checks for ``validate_gender_based_tests``."""

    @staticmethod
    def _status_for(test_name, gender):
        """Validate a one-patient frame and return only the reported status."""
        frame = pd.DataFrame(
            {'patient_id': [1], 'test_name': [test_name], 'gender': [gender]}
        )
        return validate_gender_based_tests(frame)['status']

    def test_valid_gender(self):
        """A PSA test recorded for gender 'M' is expected to pass."""
        assert self._status_for('Prostate_Specific_Antigen', 'M') == 'PASS'

    def test_invalid_gender(self):
        """A PSA test recorded for gender 'F' is expected to fail."""
        assert self._status_for('Prostate_Specific_Antigen', 'F') == 'FAIL'

    def test_unrestricted_test(self):
        """A generic blood test recorded for gender 'F' is expected to pass."""
        assert self._status_for('Blood_Test', 'F') == 'PASS'
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+ from clinical_validators.invalid_data_types import validate_invalid_data_types
8
+
9
class TestInvalidDataTypes:
    """Checks for ``validate_invalid_data_types``."""

    @staticmethod
    def _run(lab_value, visit_date):
        """Validate a one-row frame that varies only lab_value and visit_date."""
        frame = pd.DataFrame({
            'patient_id': [1001],
            'lab_value': [lab_value],
            'visit_date': [visit_date],
            'test_date': ['2026-01-15'],
        })
        return validate_invalid_data_types(frame)

    def test_valid_data_types(self):
        """A numeric lab value with ISO-formatted dates is expected to pass."""
        assert self._run(14.5, '2026-01-15')['status'] == 'PASS'

    def test_invalid_numeric_value(self):
        """A non-numeric lab value is expected to be reported first."""
        outcome = self._run('abc', '2026-01-15')
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'lab_value'

    def test_invalid_date_format(self):
        """An unparseable visit date is expected to be reported first."""
        outcome = self._run(14.5, 'not-a-date')
        assert outcome['status'] == 'FAIL'
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'visit_date'

    def test_empty_dataframe_raises_error(self):
        """An empty frame is expected to be rejected with ValueError."""
        empty = pd.DataFrame()
        with pytest.raises(ValueError):
            validate_invalid_data_types(empty)
@@ -0,0 +1,32 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+ from clinical_validators.invalid_patient_ids import validate_patient_ids
8
+
9
class TestInvalidPatientIds:
    """Checks for ``validate_patient_ids`` on string identifiers."""

    @staticmethod
    def _run(identifiers):
        """Validate a frame whose only column is ``patient_id``."""
        return validate_patient_ids(pd.DataFrame({'patient_id': identifiers}))

    def test_valid_patient_ids(self):
        """Three five-digit identifiers are expected to pass."""
        outcome = self._run(['12345', '67890', '11111'])
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_invalid_format_too_short(self):
        """A four-digit identifier is expected to be the single failure."""
        # '1234' has one digit fewer than its valid neighbour.
        outcome = self._run(['12345', '1234'])
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1

    def test_invalid_format_non_numeric(self):
        """An identifier containing letters is expected to be the single failure."""
        # 'ABC12' has the right length but is not all digits.
        outcome = self._run(['12345', 'ABC12'])
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
@@ -0,0 +1,182 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Add parent directory to path for imports
7
+ sys.path.insert(0, str(Path(__file__).parent.parent))
8
+
9
+ from clinical_validators.missing_fields import validate_missing_critical_fields
10
+
11
+
12
class TestMissingCriticalFields:
    """Test suite for the ``validate_missing_critical_fields`` validator.

    The tests rely on pytest's own reporting; they deliberately print
    nothing, because hard-coded "Test N PASSED" banners are captured by
    pytest and become misleading once tests are selected or reordered.
    """

    def test_valid_dataframe_no_missing_values(self):
        """Clean data should pass validation with an empty failure list."""
        data = {
            'patient_id': [1001, 1002, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'PASS', "Clean data should pass"
        assert result['failed_records'] == 0, "Should have 0 failed records"
        assert result['total_records'] == 3, "Should have 3 total records"
        assert len(result['failures']) == 0, "Failures list should be empty"

    def test_missing_patient_id(self):
        """A missing patient_id should be detected and attributed to that field."""
        data = {
            'patient_id': [1001, None, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'FAIL', "Should fail with missing patient_id"
        assert result['failed_records'] == 1, "Should have 1 failed record"
        assert any(f['missing_field'] == 'patient_id' for f in result['failures']), \
            "Should identify missing patient_id"

    def test_missing_multiple_fields(self):
        """Different fields missing on different rows are each counted."""
        data = {
            'patient_id': [1001, 1002, None],
            'visit_date': [None, '2026-01-16', '2026-01-17'],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', None, '2026-01-17'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'FAIL', "Should fail with missing fields"
        assert result['failed_records'] == 3, "Should have 3 failed records"
        assert result['failure_count'] == 3, "Should have 3 total failures"

    def test_missing_lab_test_name(self):
        """An empty-string lab_test_name should be treated as missing."""
        data = {
            'patient_id': [1001, 1002],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'lab_test_name': ['Hemoglobin', ''],  # Empty string on row 1
            'lab_value': [14.5, 95],
            'test_date': ['2026-01-15', '2026-01-16'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'FAIL', "Should detect empty string as missing"
        assert result['failed_records'] == 1, "Should flag empty string row"
        assert any(
            f['missing_field'] == 'lab_test_name' and f['row_index'] == 1
            for f in result['failures']
        ), "Should identify lab_test_name empty"

    def test_all_fields_missing(self):
        """Edge case: every critical field missing in a single row."""
        data = {
            'patient_id': [None, 1002],
            'visit_date': [None, '2026-01-16'],
            'lab_test_name': [None, 'Glucose'],
            'lab_value': [None, 95],
            'test_date': [None, '2026-01-16'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        # Pin the overall verdict too, as the other failing cases do.
        assert result['status'] == 'FAIL', "Should fail when a row is fully missing"
        assert result['failed_records'] == 1, "Should detect row with all missing fields"
        assert result['failure_count'] == 5, "Should count 5 failures (1 per field)"

    def test_empty_dataframe_raises_error(self):
        """An empty DataFrame should raise ValueError."""
        df = pd.DataFrame()

        with pytest.raises(ValueError):
            validate_missing_critical_fields(df)

    def test_invalid_input_type_raises_error(self):
        """Non-DataFrame input (string or list) should raise TypeError."""
        with pytest.raises(TypeError):
            validate_missing_critical_fields("not a dataframe")

        with pytest.raises(TypeError):
            validate_missing_critical_fields([1, 2, 3])

    def test_custom_critical_fields(self):
        """Only the fields passed via ``critical_fields`` should be checked."""
        data = {
            'patient_id': [1001, None],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'other_field': ['A', 'B'],
        }
        custom_fields = ['patient_id', 'other_field']
        result = validate_missing_critical_fields(
            pd.DataFrame(data), critical_fields=custom_fields
        )

        assert 'visit_date' not in result['critical_fields_checked'], \
            "Should only check specified fields"
        assert result['failed_records'] == 1, "Should detect missing patient_id"

    def test_extra_columns_ignored(self):
        """Columns outside the critical set should not affect validation."""
        data = {
            'patient_id': [1001, 1002],
            'visit_date': ['2026-01-15', '2026-01-16'],
            'lab_test_name': ['Hemoglobin', 'Glucose'],
            'lab_value': [14.5, 95],
            'test_date': ['2026-01-15', '2026-01-16'],
            'extra_column_1': ['X', 'Y'],
            'extra_column_2': [100, 200],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'PASS', "Extra columns should not cause failures"
        assert result['failed_records'] == 0, "Should have no failures"

    def test_nan_vs_none_consistency(self):
        """Both ``None`` and ``pd.NaT`` should be treated as missing.

        Row 1 carries ``None`` in ``patient_id`` and row 2 carries
        ``pd.NaT`` in ``visit_date``; each row should count as one failure.
        """
        data = {
            'patient_id': [1001, None, 1003],
            'visit_date': ['2026-01-15', '2026-01-16', pd.NaT],
            'lab_test_name': ['Hemoglobin', 'Glucose', 'Cholesterol'],
            'lab_value': [14.5, 95, 180],
            'test_date': ['2026-01-15', '2026-01-16', '2026-01-17'],
        }
        result = validate_missing_critical_fields(pd.DataFrame(data))

        assert result['status'] == 'FAIL', "Should detect both None and NaT"
        assert result['failed_records'] == 2, "Should have 2 failed records"
172
+
173
+
174
if __name__ == '__main__':
    # Convenience entry point: lets this file be executed directly instead of
    # through the pytest command line.
    print("=" * 80)
    print("CLINICAL DATA VALIDATORS - TEST SUITE")
    print("Validator: missing_critical_fields")
    print("=" * 80)
    print()

    # pytest.main() returns the exit code rather than exiting itself, so it
    # must be handed to sys.exit(); otherwise a failing run would still end
    # with status 0 and look successful to shells and CI.
    sys.exit(pytest.main([__file__, '-v', '--tb=short']))
@@ -0,0 +1,17 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+ sys.path.insert(0, str(Path(__file__).parent.parent))
6
+ from clinical_validators.missing_visit_data import validate_missing_visit_data
7
+
8
class TestMissingVisitData:
    """Checks for ``validate_missing_visit_data``."""

    @staticmethod
    def _status_for(lab_value, visit_date):
        """Validate a one-patient frame and return only the reported status."""
        frame = pd.DataFrame(
            {'patient_id': [1], 'lab_value': [lab_value], 'visit_date': [visit_date]}
        )
        return validate_missing_visit_data(frame)['status']

    def test_valid_data(self):
        """A lab value accompanied by a visit date is expected to pass."""
        assert self._status_for(100, '2026-01-01') == 'PASS'

    def test_missing_visit(self):
        """A lab value without any visit date is expected to fail."""
        assert self._status_for(100, None) == 'FAIL'

    def test_no_lab_no_visit(self):
        """A row with neither lab value nor visit date is expected to pass."""
        assert self._status_for(None, None) == 'PASS'
@@ -0,0 +1,40 @@
1
+ import pandas as pd
2
+ import pytest
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+ from clinical_validators.out_of_range_values import validate_out_of_range_values
8
+
9
class TestOutOfRangeValues:
    """Checks for ``validate_out_of_range_values`` on lab_value and age."""

    @staticmethod
    def _run(lab_values, ages):
        """Validate a two-patient frame with the given lab values and ages."""
        frame = pd.DataFrame({
            'patient_id': [1001, 1002],
            'lab_value': lab_values,
            'age': ages,
        })
        return validate_out_of_range_values(frame)

    def test_valid_ranges(self):
        """Ordinary lab values and ages are expected to pass with no failures."""
        outcome = self._run([150.5, 500], [35, 67])
        assert outcome['status'] == 'PASS'
        assert outcome['failed_records'] == 0

    def test_lab_value_too_high(self):
        """A lab value of 15000 is expected to be the single reported failure."""
        # 15000 is above the 10000 limit this case targets.
        outcome = self._run([150.5, 15000], [35, 67])
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'lab_value'

    def test_age_out_of_range(self):
        """An age of 150 is expected to be the single reported failure."""
        # 150 is above the 120 limit this case targets.
        outcome = self._run([150.5, 500], [35, 150])
        assert outcome['status'] == 'FAIL'
        assert outcome['failed_records'] == 1
        first_failure = outcome['failures'][0]
        assert first_failure['field'] == 'age'