clinical-data-validators 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: clinical-data-validators
3
+ Version: 0.4.0
4
+ Summary: A library for validating clinical data quality
5
+ Author: Navin Kumar
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: pandas>=1.3.0
12
+ Dynamic: author
13
+ Dynamic: classifier
14
+ Dynamic: description
15
+ Dynamic: description-content-type
16
+ Dynamic: requires-dist
17
+ Dynamic: requires-python
18
+ Dynamic: summary
19
+
20
+ # Clinical Data Validators
21
+
22
+ A Python library for validating clinical datasets.
23
+
24
+ ## Installation
25
+
26
+ pip install clinical-data-validators
27
+
28
+ ## Usage
29
+
30
+ ```python
31
+ import pandas as pd
32
+ from clinical_validators import validate_missing_critical_fields
33
+
34
+ df = pd.read_csv("data.csv")
35
+ result = validate_missing_critical_fields(df)
36
+ print(result)
37
+ ```
38
+
39
+ ## Validators
40
+ 1. Missing Critical Fields
41
+ 2. Invalid Data Types
42
+ 3. Future Dates
43
+ 4. Out of Range Values
44
+ 5. Duplicate Records
45
+ 6. Invalid Patient IDs
46
+ 7. Missing Visit Data
47
+ 8. Age Consistency
48
+ 9. Gender-Based Validation
49
+ 10. Data Completeness
@@ -0,0 +1,15 @@
1
+ clinical_validators/__init__.py,sha256=y3ziB5Ikc-lL3M80UnyL5iKzQTSSKEMDgpHTR7XaltY,958
2
+ clinical_validators/age_consistency.py,sha256=Uw75D5_i8RKwv0Z4NNRJs-2CiTRN4It3MLkUm2TujCI,1958
3
+ clinical_validators/data_completeness.py,sha256=IhcChtVydd2Y6T5ReDPGsiZYyjAUt5ArhgNnEmRJy-A,2142
4
+ clinical_validators/duplicate_records.py,sha256=IReFwioNGpHlNrGx3q30BQohEvTt96Y30MpR_NJjMOo,2449
5
+ clinical_validators/future_dates.py,sha256=Xy0JbqdU-vQdUFmY8jZLjasApt1LASLdmCue-mf5F-0,1508
6
+ clinical_validators/gender_based_validation.py,sha256=VWi28vGAyg8MVNjEKB9XfnK0rR-Ol0gylN0u6uxI05c,2113
7
+ clinical_validators/invalid_data_types.py,sha256=zsFifgoLUZ6REobPj72sAKgt9CthDAoeA8XjVBD6wEA,1973
8
+ clinical_validators/invalid_patient_ids.py,sha256=Sh-BrLXeLEUQJ54I9xTA6xg97__nxZpPuBv0w6SJNzA,2773
9
+ clinical_validators/missing_fields.py,sha256=08vI5SJVLsfg_y1wpiNxye9hCxFRVShv8Q_5UVdTFog,4269
10
+ clinical_validators/missing_visit_data.py,sha256=i940dRgy58soz8W2xlds7VQCoi0aNFCPZeSUM1HC42E,1565
11
+ clinical_validators/out_of_range_values.py,sha256=Rtag7MJrdyMLajNyBtDQ16bHWhtp6DHopoxBOozkxgA,2226
12
+ clinical_data_validators-0.4.0.dist-info/METADATA,sha256=oHNClMVzg6dhOCweyszLGi_93PY0fsvym5utiR6X6HI,1156
13
+ clinical_data_validators-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
14
+ clinical_data_validators-0.4.0.dist-info/top_level.txt,sha256=-OtSRpWXkxFLxhj6uy-Vn3HWbgHzWithqh4CWmkGWnQ,20
15
+ clinical_data_validators-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ clinical_validators
@@ -0,0 +1,24 @@
1
+ from .missing_fields import validate_missing_critical_fields
2
+ from .invalid_data_types import validate_invalid_data_types
3
+ from .future_dates import validate_future_dates
4
+ from .out_of_range_values import validate_out_of_range_values
5
+ from .duplicate_records import validate_duplicate_records
6
+ from .invalid_patient_ids import validate_patient_ids
7
+ from .missing_visit_data import validate_missing_visit_data
8
+ from .age_consistency import validate_age_consistency
9
+ from .gender_based_validation import validate_gender_based_tests
10
+ from .data_completeness import validate_data_completeness
11
+
12
# Package version string.
__version__ = "0.4.0"

# Names exported by ``from clinical_validators import *``: the validator
# entry points imported above.
__all__ = [
    "validate_missing_critical_fields",
    "validate_invalid_data_types",
    "validate_future_dates",
    "validate_out_of_range_values",
    "validate_duplicate_records",
    "validate_patient_ids",
    "validate_missing_visit_data",
    "validate_age_consistency",
    "validate_gender_based_tests",
    "validate_data_completeness",
]
@@ -0,0 +1,56 @@
1
+ import pandas as pd
2
+ from datetime import datetime
3
+
4
def validate_age_consistency(dataframe, age_field='age', birth_date_field='birth_date', tolerance_years=1):
    """
    Validates that the recorded age is consistent with the birth date.

    The age implied by the birth date is computed as elapsed days / 365.25
    relative to the current local time and compared with the recorded age.

    Args:
        dataframe: pandas DataFrame.
        age_field: Column name for age.
        birth_date_field: Column name for birth date.
        tolerance_years: Allowed absolute difference in years (default 1).

    Returns:
        dict with validation results (validator_name, status, total_records,
        failed_records, failure_count, failures).

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    failures = []
    failed_row_indices = set()
    now = pd.Timestamp.now()

    for idx, row in dataframe.iterrows():
        age = row.get(age_field)
        birth_date = row.get(birth_date_field)

        # Rows missing either value (or whose column is absent) are skipped.
        if pd.isna(age) or pd.isna(birth_date):
            continue

        try:
            recorded_age = float(age)
            birth_ts = pd.to_datetime(birth_date)
            if pd.isna(birth_ts):
                continue
            # Timezone-aware birth dates cannot be subtracted from a naive
            # "now"; previously that TypeError was swallowed and the row was
            # never checked. Drop the offset: a few hours are irrelevant at
            # year granularity.
            if birth_ts.tzinfo is not None:
                birth_ts = birth_ts.tz_localize(None)
            calculated_age = (now - birth_ts).days / 365.25
        except (ValueError, TypeError):
            # Unparseable age or birth date: not this validator's concern.
            continue

        if abs(calculated_age - recorded_age) > tolerance_years:
            failed_row_indices.add(idx)
            failures.append({
                'row_index': idx,
                'patient_id': row.get('patient_id', 'UNKNOWN'),
                'recorded_age': age,
                'calculated_age': round(calculated_age, 1),
                'error': 'Age does not match birth date'
            })

    return {
        'validator_name': 'age_consistency',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,57 @@
1
+ import pandas as pd
2
+
3
def validate_data_completeness(dataframe, test_field='test_type', completeness_rules=None):
    """
    Checks that each row carries the fields its test type requires.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name holding the test type of each row.
        completeness_rules: Dict mapping a test type to the list of fields
                            that must be filled in for it.
                            Default: {'Blood': ['hemoglobin', 'wbc'], 'Urine': ['ph', 'protein']}

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    rules = completeness_rules
    if rules is None:
        rules = {
            'Blood': ['hemoglobin', 'wbc'],
            'Urine': ['ph', 'protein']
        }

    def _is_blank(record, column):
        # A field counts as missing when its column is absent, the cell is
        # NA, or the cell is only whitespace once rendered as text.
        if column not in dataframe.columns:
            return True
        cell = record.get(column)
        if pd.isna(cell):
            return True
        return str(cell).strip() == ''

    problems = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        raw_type = record.get(test_field)
        if pd.isna(raw_type):
            continue

        kind = str(raw_type).strip()
        if kind not in rules:
            # Test types without a rule are not checked.
            continue

        for column in rules[kind]:
            if not _is_blank(record, column):
                continue
            bad_rows.add(label)
            problems.append({
                'row_index': label,
                'patient_id': record.get('patient_id', 'UNKNOWN'),
                'test_type': kind,
                'missing_field': column,
                'error': f"Missing required field '{column}' for {kind} test"
            })

    summary = {'validator_name': 'data_completeness'}
    summary['status'] = 'FAIL' if bad_rows else 'PASS'
    summary['total_records'] = len(dataframe)
    summary['failed_records'] = len(bad_rows)
    summary['failure_count'] = len(problems)
    summary['failures'] = problems
    return summary
@@ -0,0 +1,62 @@
1
+ import pandas as pd
2
+
3
def validate_duplicate_records(dataframe, duplicate_fields=None):
    """
    Validates that there are no duplicate records based on specified fields.
    Typical use case: same patient + same test + same date

    Rows whose key fields are all equal are reported as duplicates. Missing
    values (NaN/None) in a key field are treated as equal to each other, so
    repeated rows with the same missing key are reported as well.

    Args:
        dataframe: pandas DataFrame with clinical data
        duplicate_fields: list of field names to check for duplicates
                         Default: ['patient_id', 'test_name', 'test_date']
                         Fields not present in the DataFrame are ignored.

    Returns:
        dict with validation results; each failure carries the row index,
        the fields compared, the shared values and the group size.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty, or fewer than 2 of the requested
                    fields exist in it
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if duplicate_fields is None:
        duplicate_fields = ['patient_id', 'test_name', 'test_date']

    # Only compare on the fields that actually exist
    available_fields = [f for f in duplicate_fields if f in dataframe.columns]
    if len(available_fields) < 2:
        raise ValueError(f"Need at least 2 fields to check duplicates. Found: {available_fields}")

    # keep=False marks every member of a duplicate group, not just the repeats
    duplicates = dataframe[dataframe.duplicated(subset=available_fields, keep=False)]

    failures = []
    if not duplicates.empty:
        # dropna=False keeps groups whose key contains NaN, matching how
        # duplicated() already treats missing values as equal. Iterating the
        # groups directly also avoids rebuilding a boolean mask per group.
        for key, group in duplicates.groupby(available_fields, dropna=False):
            shared_values = dict(zip(available_fields, key))
            occurrences = len(group)
            for idx in group.index.tolist():
                failures.append({
                    'row_index': idx,
                    'duplicate_fields': available_fields,
                    'duplicate_values': dict(shared_values),
                    'occurrences': occurrences
                })

    failed_row_indices = {f['row_index'] for f in failures}

    return {
        'validator_name': 'duplicate_records',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,37 @@
1
+ import pandas as pd
2
+
3
def validate_future_dates(dataframe, date_fields=None):
    """
    Validates that date fields do not contain dates in the future.

    Timezone-aware values are compared against the current instant in UTC;
    timezone-naive values are compared against the current local time.

    Args:
        dataframe: pandas DataFrame with clinical data.
        date_fields: list of column names to check.
                     Default: ['visit_date', 'test_date']
                     Columns not present in the DataFrame are ignored.

    Returns:
        dict with validation results.

    Raises:
        TypeError: if dataframe is not a pandas DataFrame.
        ValueError: if dataframe is empty.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if date_fields is None:
        date_fields = ['visit_date', 'test_date']

    failures = []
    failed_row_indices = set()
    now_naive = pd.Timestamp.now()
    now_utc = pd.Timestamp.now(tz='UTC')
    has_patient_id = 'patient_id' in dataframe.columns

    for field in date_fields:
        if field not in dataframe.columns:
            continue
        for idx, val in dataframe[field].items():
            if pd.isna(val) or str(val).strip() == '':
                continue
            try:
                date_val = pd.to_datetime(val)
                if pd.isna(date_val):
                    continue
                # Stripping the offset (the previous approach) compares a
                # foreign wall-clock time with local "now" and misjudges
                # values by up to the offset difference. Compare aware values
                # as instants instead; naive values keep local-time semantics.
                if date_val.tzinfo is not None:
                    is_future = date_val > now_utc
                else:
                    is_future = date_val > now_naive
            except (ValueError, TypeError):
                # Unparseable dates are not this validator's concern.
                continue

            if is_future:
                failed_row_indices.add(idx)
                patient_id = dataframe.at[idx, 'patient_id'] if has_patient_id else 'UNKNOWN'
                failures.append({
                    'row_index': idx,
                    'field': field,
                    'invalid_value': str(val),
                    'error': 'Date is in the future',
                    'patient_id': patient_id
                })

    return {
        'validator_name': 'future_dates',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,61 @@
1
+ import pandas as pd
2
+
3
def validate_gender_based_tests(dataframe, test_field='test_name', gender_field='gender', rules=None):
    """
    Flags rows where a gender-restricted test is recorded for a gender the
    rule does not allow.

    Args:
        dataframe: pandas DataFrame.
        test_field: Column name for test name.
        gender_field: Column name for gender.
        rules: Dict mapping test names to the genders they are allowed for.
               Gender comparison ignores case; test names must match exactly
               after trimming whitespace.
               Default: {'Prostate_Specific_Antigen': ['M', 'Male'], 'Pap_Smear': ['F', 'Female']}

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    active_rules = rules
    if active_rules is None:
        active_rules = {
            'Prostate_Specific_Antigen': ['M', 'Male'],
            'Pap_Smear': ['F', 'Female']
        }

    problems = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        raw_test = record.get(test_field)
        raw_gender = record.get(gender_field)

        # Nothing to check when either value is missing.
        if pd.isna(raw_test) or pd.isna(raw_gender):
            continue

        test_label = str(raw_test).strip()
        gender_label = str(raw_gender).strip()

        if test_label not in active_rules:
            continue

        permitted = [option.lower() for option in active_rules[test_label]]
        if gender_label.lower() in permitted:
            continue

        bad_rows.add(label)
        problems.append({
            'row_index': label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'test': test_label,
            'gender': gender_label,
            'error': f"Test {test_label} is invalid for gender {gender_label}"
        })

    outcome = 'FAIL' if bad_rows else 'PASS'
    return {
        'validator_name': 'gender_based_validation',
        'status': outcome,
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
@@ -0,0 +1,46 @@
1
+ import pandas as pd
2
+
3
def validate_invalid_data_types(dataframe, numeric_fields=None, date_fields=None):
    """
    Flags values that cannot be converted to the type their column expects.

    Args:
        dataframe: pandas DataFrame with clinical data.
        numeric_fields: columns whose values must be accepted by float().
                        Default: ['lab_value']
        date_fields: columns whose values must be accepted by pd.to_datetime().
                     Default: ['visit_date', 'test_date']

    Columns not present in the DataFrame are ignored; NA and blank cells are
    skipped.

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    numeric_columns = ['lab_value'] if numeric_fields is None else numeric_fields
    date_columns = ['visit_date', 'test_date'] if date_fields is None else date_fields

    problems = []
    bad_rows = set()

    def _scan(columns, convert, type_label):
        # Try ``convert`` on every filled cell of each listed column and
        # record the cells it rejects.
        for column in columns:
            if column not in dataframe.columns:
                continue
            for label, cell in dataframe[column].items():
                if pd.isna(cell) or str(cell).strip() == '':
                    continue
                try:
                    convert(cell)
                except (ValueError, TypeError):
                    bad_rows.add(label)
                    if 'patient_id' in dataframe.columns:
                        owner = dataframe.at[label, 'patient_id']
                    else:
                        owner = 'UNKNOWN'
                    problems.append({
                        'row_index': label,
                        'field': column,
                        'invalid_value': str(cell),
                        'expected_type': type_label,
                        'patient_id': owner
                    })

    _scan(numeric_columns, float, 'numeric')
    _scan(date_columns, pd.to_datetime, 'datetime')

    return {
        'validator_name': 'invalid_data_types',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
@@ -0,0 +1,74 @@
1
+ import pandas as pd
2
+ import re
3
+
4
def validate_patient_ids(dataframe, patient_id_field='patient_id', pattern=None, min_length=None, max_length=None):
    """
    Validates patient ID format according to specified rules.

    Args:
        dataframe: pandas DataFrame with clinical data
        patient_id_field: name of the patient ID column (default: 'patient_id')
        pattern: regex pattern the ID must match from its start (default: None)
        min_length: minimum length of patient ID (default: None)
        max_length: maximum length of patient ID (default: None)

    Default behavior: when no pattern or length is given, validates that
    patient IDs are exactly 5 digits.

    Missing and blank IDs are skipped. IDs stored as whole-number floats
    (pandas converts an integer column to float as soon as one value is
    missing) are checked in their integer form, e.g. 12345.0 -> '12345'.
    When both a pattern and length limits are given, length is only checked
    for IDs that match the pattern.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty or the ID column is missing
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    if patient_id_field not in dataframe.columns:
        raise ValueError(f"Column '{patient_id_field}' not found in DataFrame")

    # Default: 5-digit numeric ID
    if pattern is None and min_length is None and max_length is None:
        pattern = r'^\d{5}$'  # Exactly 5 digits

    # Compile once instead of re-parsing the pattern for every row.
    matcher = re.compile(pattern) if pattern else None

    failures = []
    failed_row_indices = set()

    for idx, val in dataframe[patient_id_field].items():
        if pd.isna(val) or str(val).strip() == '':
            continue  # Skip missing values

        if isinstance(val, float) and val.is_integer():
            # str(12345.0) is '12345.0', which would fail every digit-only
            # pattern purely because of the column's float dtype.
            patient_id_str = str(int(val))
        else:
            patient_id_str = str(val).strip()

        error_reason = ""

        if matcher is not None and not matcher.match(patient_id_str):
            error_reason = f"Does not match pattern: {pattern}"
        elif min_length is not None and len(patient_id_str) < min_length:
            error_reason = f"Length {len(patient_id_str)} < minimum {min_length}"
        elif max_length is not None and len(patient_id_str) > max_length:
            error_reason = f"Length {len(patient_id_str)} > maximum {max_length}"
        else:
            continue

        failed_row_indices.add(idx)
        failures.append({
            'row_index': idx,
            'field': patient_id_field,
            'invalid_value': patient_id_str,
            'error': error_reason
        })

    return {
        'validator_name': 'invalid_patient_ids',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,113 @@
1
+ import pandas as pd
2
+ from typing import Dict, List, Optional
3
+
4
def validate_missing_critical_fields(
    dataframe: pd.DataFrame,
    critical_fields: Optional[List[str]] = None
) -> Dict:
    """
    Validates that critical clinical fields are not NULL/empty.

    A value counts as missing when it is NA (None, NaN, NaT) or a string
    that is empty or contains only whitespace.

    Args:
        dataframe: pandas DataFrame with clinical data
        critical_fields: list of required field names. If None, uses default
                        clinical fields (patient_id, visit_date, lab_test_name,
                        lab_value, test_date)

    Returns:
        Dictionary with validation results containing:
        - validator_name: name of the validator
        - status: 'PASS' or 'FAIL'
        - total_records: total rows in dataframe
        - failed_records: number of distinct row index labels with missing values
        - critical_fields_checked: the list of critical fields requested
        - fields_not_found: requested fields that are not columns of the
          dataframe and were therefore skipped (they do not affect status)
        - failure_count: total number of missing value instances
        - failures: list of detailed failure information

    Raises:
        ValueError: if dataframe is empty or critical_fields is empty list
        TypeError: if dataframe is not a pandas DataFrame

    Example:
        >>> import pandas as pd
        >>> from clinical_validators import validate_missing_critical_fields
        >>> df = pd.read_csv('lab_data.csv')
        >>> result = validate_missing_critical_fields(df)
        >>> print(result['status'])
        'FAIL'
        >>> print(f"Failed records: {result['failed_records']}")
        'Failed records: 5'
    """

    # ===== INPUT VALIDATION =====
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(dataframe).__name__}")

    if dataframe.empty:
        raise ValueError("DataFrame is empty - cannot validate empty dataset")

    # Default critical fields for clinical data
    if critical_fields is None:
        critical_fields = [
            'patient_id',
            'visit_date',
            'lab_test_name',
            'lab_value',
            'test_date'
        ]

    if isinstance(critical_fields, list) and len(critical_fields) == 0:
        raise ValueError("critical_fields list cannot be empty")

    def _is_blank(value):
        # Whitespace-only text is as empty as '' for data-quality purposes.
        return isinstance(value, str) and value.strip() == ''

    # ===== VALIDATION LOGIC =====
    # Rows are addressed by position so that duplicate index labels cannot
    # turn a single-cell lookup into a Series.
    row_labels = dataframe.index.tolist()
    if 'patient_id' in dataframe.columns:
        patient_ids = dataframe['patient_id'].tolist()
    else:
        patient_ids = None

    failures = []
    fields_not_found = []

    for field in critical_fields:
        # A field that is not a column cannot be checked; report it rather
        # than skipping it without trace.
        if field not in dataframe.columns:
            fields_not_found.append(field)
            continue

        column = dataframe[field]
        cells = zip(column.tolist(), column.isna().tolist())

        for position, (value, is_na) in enumerate(cells):
            if not (is_na or _is_blank(value)):
                continue

            # Attach the patient ID for context when one is available
            patient_id = 'UNKNOWN'
            if patient_ids is not None:
                pid_value = patient_ids[position]
                if not (pd.isna(pid_value) or _is_blank(pid_value)):
                    patient_id = str(pid_value)

            failures.append({
                'row_index': row_labels[position],
                'missing_field': field,
                'patient_id': patient_id
            })

    # Count unique rows with any failures
    failed_row_indices = {f['row_index'] for f in failures}

    # ===== RETURN RESULTS =====
    return {
        'validator_name': 'missing_critical_fields',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'critical_fields_checked': critical_fields,
        'fields_not_found': fields_not_found,
        'failure_count': len(failures),
        'failures': failures
    }
@@ -0,0 +1,43 @@
1
+ import pandas as pd
2
+
3
def validate_missing_visit_data(dataframe, lab_field='lab_value', visit_field='visit_date'):
    """
    Flags rows that carry a lab result but no visit date.

    Args:
        dataframe: pandas DataFrame with clinical data.
        lab_field: Column name containing lab results.
        visit_field: Column name containing visit dates.

    A cell counts as filled when it is not NA and is not blank once rendered
    as text.

    Returns:
        dict with validation results.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    def _filled(cell):
        return pd.notna(cell) and str(cell).strip() != ''

    problems = []
    bad_rows = set()

    for label, record in dataframe.iterrows():
        lab_present = _filled(record.get(lab_field))
        visit_present = _filled(record.get(visit_field))

        # Only a lab value without a visit is a problem.
        if not lab_present or visit_present:
            continue

        bad_rows.add(label)
        problems.append({
            'row_index': label,
            'patient_id': record.get('patient_id', 'UNKNOWN'),
            'error': f"Has lab value '{record[lab_field]}' but missing {visit_field}"
        })

    return {
        'validator_name': 'missing_visit_data',
        'status': 'FAIL' if bad_rows else 'PASS',
        'total_records': len(dataframe),
        'failed_records': len(bad_rows),
        'failure_count': len(problems),
        'failures': problems
    }
@@ -0,0 +1,61 @@
1
+ import pandas as pd
2
+
3
def validate_out_of_range_values(dataframe, range_rules=None):
    """
    Validates that numeric fields fall within acceptable clinical ranges.

    Args:
        dataframe: pandas DataFrame with clinical data
        range_rules: dict with field names as keys and (min, max) tuples as values
                    Example: {'lab_value': (0, 1000), 'age': (0, 120)}
                    Either bound may be None to leave that side unbounded,
                    e.g. {'lab_value': (0, None)}.
                    Default: {'lab_value': (0, 10000), 'age': (0, 120)}

    Fields not present in the DataFrame are ignored. Missing values and
    values that cannot be converted to float are skipped. Bounds are
    inclusive.

    Returns:
        dict with validation results

    Raises:
        TypeError: if dataframe is not a pandas DataFrame
        ValueError: if dataframe is empty
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Expected pandas DataFrame")
    if dataframe.empty:
        raise ValueError("DataFrame is empty")

    # Default clinical ranges
    if range_rules is None:
        range_rules = {
            'lab_value': (0, 10000),  # Generic lab value range
            'age': (0, 120)           # Human age range
        }

    failures = []
    failed_row_indices = set()
    has_patient_id = 'patient_id' in dataframe.columns

    for field, (min_val, max_val) in range_rules.items():
        if field not in dataframe.columns:
            continue

        for idx, val in dataframe[field].items():
            if pd.isna(val):
                continue  # Skip missing values

            try:
                numeric_val = float(val)
                # A None bound used to raise TypeError here, which the handler
                # below swallowed, silently disabling the rule. Treat None as
                # "no limit on this side" instead.
                too_low = min_val is not None and numeric_val < min_val
                too_high = max_val is not None and numeric_val > max_val
            except (ValueError, TypeError):
                # Skip non-numeric values (handled by invalid_data_types
                # validator); bounds that cannot be compared with a number
                # are skipped as before.
                continue

            if too_low or too_high:
                failed_row_indices.add(idx)
                patient_id = dataframe.at[idx, 'patient_id'] if has_patient_id else 'UNKNOWN'
                failures.append({
                    'row_index': idx,
                    'field': field,
                    'invalid_value': numeric_val,
                    'expected_range': f"{min_val}-{max_val}",
                    'patient_id': patient_id
                })

    return {
        'validator_name': 'out_of_range_values',
        'status': 'PASS' if len(failed_row_indices) == 0 else 'FAIL',
        'total_records': len(dataframe),
        'failed_records': len(failed_row_indices),
        'failure_count': len(failures),
        'failures': failures
    }