iints-sdk-python35 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iints/__init__.py +183 -0
- iints/analysis/__init__.py +12 -0
- iints/analysis/algorithm_xray.py +387 -0
- iints/analysis/baseline.py +92 -0
- iints/analysis/clinical_benchmark.py +198 -0
- iints/analysis/clinical_metrics.py +551 -0
- iints/analysis/clinical_tir_analyzer.py +136 -0
- iints/analysis/diabetes_metrics.py +43 -0
- iints/analysis/edge_efficiency.py +33 -0
- iints/analysis/edge_performance_monitor.py +315 -0
- iints/analysis/explainability.py +94 -0
- iints/analysis/explainable_ai.py +232 -0
- iints/analysis/hardware_benchmark.py +221 -0
- iints/analysis/metrics.py +117 -0
- iints/analysis/population_report.py +188 -0
- iints/analysis/reporting.py +345 -0
- iints/analysis/safety_index.py +311 -0
- iints/analysis/sensor_filtering.py +54 -0
- iints/analysis/validator.py +273 -0
- iints/api/__init__.py +0 -0
- iints/api/base_algorithm.py +307 -0
- iints/api/registry.py +103 -0
- iints/api/template_algorithm.py +195 -0
- iints/assets/iints_logo.png +0 -0
- iints/cli/__init__.py +0 -0
- iints/cli/cli.py +2598 -0
- iints/core/__init__.py +1 -0
- iints/core/algorithms/__init__.py +0 -0
- iints/core/algorithms/battle_runner.py +138 -0
- iints/core/algorithms/correction_bolus.py +95 -0
- iints/core/algorithms/discovery.py +92 -0
- iints/core/algorithms/fixed_basal_bolus.py +58 -0
- iints/core/algorithms/hybrid_algorithm.py +92 -0
- iints/core/algorithms/lstm_algorithm.py +138 -0
- iints/core/algorithms/mock_algorithms.py +162 -0
- iints/core/algorithms/pid_controller.py +88 -0
- iints/core/algorithms/standard_pump_algo.py +64 -0
- iints/core/device.py +0 -0
- iints/core/device_manager.py +64 -0
- iints/core/devices/__init__.py +3 -0
- iints/core/devices/models.py +160 -0
- iints/core/patient/__init__.py +9 -0
- iints/core/patient/bergman_model.py +341 -0
- iints/core/patient/models.py +285 -0
- iints/core/patient/patient_factory.py +117 -0
- iints/core/patient/profile.py +41 -0
- iints/core/safety/__init__.py +12 -0
- iints/core/safety/config.py +37 -0
- iints/core/safety/input_validator.py +95 -0
- iints/core/safety/supervisor.py +39 -0
- iints/core/simulation/__init__.py +0 -0
- iints/core/simulation/scenario_parser.py +61 -0
- iints/core/simulator.py +874 -0
- iints/core/supervisor.py +367 -0
- iints/data/__init__.py +53 -0
- iints/data/adapter.py +142 -0
- iints/data/column_mapper.py +398 -0
- iints/data/datasets.json +132 -0
- iints/data/demo/__init__.py +1 -0
- iints/data/demo/demo_cgm.csv +289 -0
- iints/data/importer.py +275 -0
- iints/data/ingestor.py +162 -0
- iints/data/nightscout.py +128 -0
- iints/data/quality_checker.py +550 -0
- iints/data/registry.py +166 -0
- iints/data/tidepool.py +38 -0
- iints/data/universal_parser.py +813 -0
- iints/data/virtual_patients/clinic_safe_baseline.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hyper_challenge.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hypo_prone.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_midnight.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_pizza.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_stress_meal.yaml +9 -0
- iints/data/virtual_patients/default_patient.yaml +11 -0
- iints/data/virtual_patients/patient_559_config.yaml +11 -0
- iints/emulation/__init__.py +80 -0
- iints/emulation/legacy_base.py +414 -0
- iints/emulation/medtronic_780g.py +337 -0
- iints/emulation/omnipod_5.py +367 -0
- iints/emulation/tandem_controliq.py +393 -0
- iints/highlevel.py +451 -0
- iints/learning/__init__.py +3 -0
- iints/learning/autonomous_optimizer.py +194 -0
- iints/learning/learning_system.py +122 -0
- iints/metrics.py +34 -0
- iints/population/__init__.py +11 -0
- iints/population/generator.py +131 -0
- iints/population/runner.py +327 -0
- iints/presets/__init__.py +28 -0
- iints/presets/presets.json +114 -0
- iints/research/__init__.py +30 -0
- iints/research/config.py +68 -0
- iints/research/dataset.py +319 -0
- iints/research/losses.py +73 -0
- iints/research/predictor.py +329 -0
- iints/scenarios/__init__.py +3 -0
- iints/scenarios/generator.py +92 -0
- iints/templates/__init__.py +0 -0
- iints/templates/default_algorithm.py +91 -0
- iints/templates/scenarios/__init__.py +0 -0
- iints/templates/scenarios/chaos_insulin_stacking.json +29 -0
- iints/templates/scenarios/chaos_runaway_ai.json +25 -0
- iints/templates/scenarios/example_scenario.json +35 -0
- iints/templates/scenarios/exercise_stress.json +30 -0
- iints/utils/__init__.py +3 -0
- iints/utils/plotting.py +50 -0
- iints/utils/run_io.py +152 -0
- iints/validation/__init__.py +133 -0
- iints/validation/schemas.py +94 -0
- iints/visualization/__init__.py +34 -0
- iints/visualization/cockpit.py +691 -0
- iints/visualization/uncertainty_cloud.py +612 -0
- iints_sdk_python35-0.0.18.dist-info/METADATA +225 -0
- iints_sdk_python35-0.0.18.dist-info/RECORD +118 -0
- iints_sdk_python35-0.0.18.dist-info/WHEEL +5 -0
- iints_sdk_python35-0.0.18.dist-info/entry_points.txt +10 -0
- iints_sdk_python35-0.0.18.dist-info/licenses/LICENSE +28 -0
- iints_sdk_python35-0.0.18.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Data Quality Checker - IINTS-AF
|
|
4
|
+
Validates data quality and calculates confidence scores with gap detection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime, timedelta
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class QualityReport:
    """Aggregated result of a data-quality inspection.

    All scores are fractions in [0.0, 1.0]; ``overall_score`` is a
    weighted combination of the three component scores.
    """
    overall_score: float            # weighted combination of the components
    completeness_score: float       # fraction of expected readings present
    consistency_score: float        # regularity of the sampling interval
    validity_score: float           # fraction of readings passing range checks
    gaps: List['DataGap']           # detected coverage gaps
    anomalies: List['DataAnomaly']  # detected suspicious readings
    warnings: List[str]             # human-readable warning strings
    summary: str                    # one-line overall verdict

    def to_dict(self) -> Dict:
        """Serialize the report (including nested gaps/anomalies) to a dict."""
        payload = {
            'overall_score': self.overall_score,
            'completeness_score': self.completeness_score,
            'consistency_score': self.consistency_score,
            'validity_score': self.validity_score,
            'gaps': [gap.to_dict() for gap in self.gaps],
            'anomalies': [anomaly.to_dict() for anomaly in self.anomalies],
            'warnings': self.warnings,
            'summary': self.summary,
        }
        return payload
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class DataGap:
    """A contiguous stretch of missing readings in the time series."""
    start_time: float              # minutes; last reading before the gap
    end_time: float                # minutes; first reading after the gap
    duration_minutes: float        # end_time - start_time
    data_points_missing: int       # readings expected but absent in the gap
    percentage_of_total: float     # missing points as % of expected total
    time_range_description: str    # e.g. "14:00 - 16:00"

    def to_dict(self) -> Dict:
        """Serialize the gap to a plain dict."""
        field_names = (
            'start_time', 'end_time', 'duration_minutes',
            'data_points_missing', 'percentage_of_total',
            'time_range_description',
        )
        return {name: getattr(self, name) for name in field_names}

    def get_warning_message(self) -> str:
        """Format this gap as a single human-readable warning line."""
        return (
            f"[WARN] DATA GAP DETECTED: {self.percentage_of_total:.1f}% "
            f"of data missing ({self.data_points_missing} points) "
            f"between {self.time_range_description} "
            f"({self.duration_minutes:.0f} minutes)"
        )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class DataAnomaly:
    """A single suspicious reading flagged by the validity checks."""
    index: int          # row label in the source DataFrame
    timestamp: float    # minutes from start of the series
    value: float        # glucose reading (mg/dL)
    anomaly_type: str   # one of: 'outlier', 'impossible_value', 'rapid_change'
    severity: str       # one of: 'low', 'medium', 'high'
    description: str    # human-readable explanation

    def to_dict(self) -> Dict:
        """Serialize the anomaly to a plain dict."""
        return dict(
            index=self.index,
            timestamp=self.timestamp,
            value=self.value,
            anomaly_type=self.anomaly_type,
            severity=self.severity,
            description=self.description,
        )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DataQualityChecker:
|
|
90
|
+
"""
|
|
91
|
+
Validates data quality and calculates confidence scores.
|
|
92
|
+
|
|
93
|
+
Performs comprehensive checks:
|
|
94
|
+
- Completeness: Detects missing data and gaps
|
|
95
|
+
- Consistency: Validates temporal sampling
|
|
96
|
+
- Validity: Checks value ranges
|
|
97
|
+
|
|
98
|
+
Outputs confidence score and detailed warnings.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
# Physiological limits for glucose values
|
|
102
|
+
GLUCOSE_LIMITS = {
|
|
103
|
+
'minimum': 20, # mg/dL - physiologically possible minimum
|
|
104
|
+
'maximum': 600, # mg/dL - physiologically possible maximum
|
|
105
|
+
'critical_low': 54, # mg/dL - clinically significant low
|
|
106
|
+
'critical_high': 350 # mg/dL - clinically significant high
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
PHYSIOLOGICAL_RATES = {
|
|
110
|
+
'max_glucose_change_per_min': 19.9 # mg/dL/min - Detecting changes of 20 mg/dL/min or more
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
# Expected sampling intervals (in minutes)
|
|
114
|
+
EXPECTED_INTERVALS = {
|
|
115
|
+
'cgm': 5, # Continuous Glucose Monitor
|
|
116
|
+
'bg_meter': 60, # Blood glucose meter
|
|
117
|
+
'manual': 240 # Manual logging
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
def __init__(self, expected_interval: int = 5, source_type: str = 'cgm'):
|
|
121
|
+
"""
|
|
122
|
+
Initialize quality checker.
|
|
123
|
+
|
|
124
|
+
Args:
|
|
125
|
+
expected_interval: Expected time between readings in minutes
|
|
126
|
+
source_type: Data source type ('cgm', 'bg_meter', 'manual')
|
|
127
|
+
"""
|
|
128
|
+
self.expected_interval = expected_interval
|
|
129
|
+
self.source_type = source_type
|
|
130
|
+
|
|
131
|
+
def check_completeness(self, df: pd.DataFrame) -> Tuple[float, List[DataGap]]:
|
|
132
|
+
"""
|
|
133
|
+
Check data completeness and detect gaps.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
df: DataFrame with timestamp and glucose columns
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Tuple of (completeness_score, list of gaps)
|
|
140
|
+
"""
|
|
141
|
+
if 'timestamp' not in df.columns:
|
|
142
|
+
return 1.0, [] # Can't check without timestamp
|
|
143
|
+
|
|
144
|
+
timestamps = df['timestamp'].dropna().sort_values().astype(float)
|
|
145
|
+
|
|
146
|
+
if len(timestamps) < 2:
|
|
147
|
+
return 1.0, []
|
|
148
|
+
|
|
149
|
+
# Calculate expected number of readings
|
|
150
|
+
time_span = timestamps.iloc[-1] - timestamps.iloc[0]
|
|
151
|
+
expected_readings = int((time_span / self.expected_interval) + 1)
|
|
152
|
+
actual_readings = len(timestamps)
|
|
153
|
+
|
|
154
|
+
# Completeness score
|
|
155
|
+
completeness = min(1.0, actual_readings / expected_readings)
|
|
156
|
+
|
|
157
|
+
# Detect gaps
|
|
158
|
+
gaps = self._detect_gaps(timestamps, time_span, actual_readings, int(expected_readings))
|
|
159
|
+
|
|
160
|
+
return completeness, gaps
|
|
161
|
+
|
|
162
|
+
def _detect_gaps(self,
|
|
163
|
+
timestamps: pd.Series,
|
|
164
|
+
time_span: float,
|
|
165
|
+
actual_readings: int,
|
|
166
|
+
expected_readings: int) -> List[DataGap]:
|
|
167
|
+
"""Detect gaps in the data"""
|
|
168
|
+
gaps: List[DataGap] = []
|
|
169
|
+
|
|
170
|
+
if actual_readings < 2:
|
|
171
|
+
return gaps
|
|
172
|
+
|
|
173
|
+
# Calculate time differences between consecutive readings
|
|
174
|
+
time_diffs = timestamps.diff().dropna().astype(float)
|
|
175
|
+
|
|
176
|
+
# Threshold for gap detection (3x expected interval)
|
|
177
|
+
gap_threshold = float(self.expected_interval * 3)
|
|
178
|
+
|
|
179
|
+
# Find gap locations
|
|
180
|
+
gap_indices = time_diffs[time_diffs > gap_threshold].index
|
|
181
|
+
|
|
182
|
+
for idx in gap_indices:
|
|
183
|
+
# Get timestamps around the gap
|
|
184
|
+
before_idx = idx - 1
|
|
185
|
+
after_idx = idx
|
|
186
|
+
|
|
187
|
+
start_time = timestamps.loc[before_idx]
|
|
188
|
+
end_time = timestamps.loc[after_idx]
|
|
189
|
+
|
|
190
|
+
gap_duration = end_time - start_time
|
|
191
|
+
points_missing = int(gap_duration / self.expected_interval) - 1
|
|
192
|
+
gap_percentage = (points_missing / expected_readings) * 100 if expected_readings > 0 else 0
|
|
193
|
+
|
|
194
|
+
# Create time range description
|
|
195
|
+
start_minutes = int(start_time)
|
|
196
|
+
end_minutes = int(end_time)
|
|
197
|
+
hours_start = start_minutes // 60
|
|
198
|
+
mins_start = start_minutes % 60
|
|
199
|
+
hours_end = end_minutes // 60
|
|
200
|
+
mins_end = end_minutes % 60
|
|
201
|
+
|
|
202
|
+
time_range_desc = f"{hours_start:02d}:{mins_start:02d} - {hours_end:02d}:{mins_end:02d}"
|
|
203
|
+
|
|
204
|
+
gap = DataGap(
|
|
205
|
+
start_time=start_time,
|
|
206
|
+
end_time=end_time,
|
|
207
|
+
duration_minutes=gap_duration,
|
|
208
|
+
data_points_missing=points_missing,
|
|
209
|
+
percentage_of_total=gap_percentage,
|
|
210
|
+
time_range_description=time_range_desc
|
|
211
|
+
)
|
|
212
|
+
gaps.append(gap)
|
|
213
|
+
|
|
214
|
+
return gaps
|
|
215
|
+
|
|
216
|
+
def check_consistency(self, df: pd.DataFrame) -> float:
|
|
217
|
+
"""
|
|
218
|
+
Check temporal consistency of data.
|
|
219
|
+
|
|
220
|
+
Args:
|
|
221
|
+
df: DataFrame with timestamp column
|
|
222
|
+
|
|
223
|
+
Returns:
|
|
224
|
+
Consistency score (0.0 - 1.0)
|
|
225
|
+
"""
|
|
226
|
+
if 'timestamp' not in df.columns:
|
|
227
|
+
return 1.0
|
|
228
|
+
|
|
229
|
+
timestamps = df['timestamp'].dropna().sort_values()
|
|
230
|
+
|
|
231
|
+
if len(timestamps) < 3:
|
|
232
|
+
return 1.0
|
|
233
|
+
|
|
234
|
+
# Calculate time differences
|
|
235
|
+
time_diffs = timestamps.diff().dropna()
|
|
236
|
+
|
|
237
|
+
if len(time_diffs) == 0:
|
|
238
|
+
return 1.0
|
|
239
|
+
|
|
240
|
+
# Check for irregular intervals
|
|
241
|
+
mean_interval = time_diffs.mean()
|
|
242
|
+
std_interval = time_diffs.std()
|
|
243
|
+
|
|
244
|
+
# Coefficient of variation
|
|
245
|
+
cv = std_interval / mean_interval if mean_interval > 0 else 0
|
|
246
|
+
|
|
247
|
+
# Score based on CV (lower is better)
|
|
248
|
+
if cv < 0.1: # Very consistent
|
|
249
|
+
return 1.0
|
|
250
|
+
elif cv < 0.25: # Mostly consistent
|
|
251
|
+
return 0.9
|
|
252
|
+
elif cv < 0.5: # Somewhat inconsistent
|
|
253
|
+
return 0.7
|
|
254
|
+
else: # Very inconsistent
|
|
255
|
+
return 0.5
|
|
256
|
+
|
|
257
|
+
def check_validity(self, df: pd.DataFrame) -> Tuple[float, List[DataAnomaly]]:
|
|
258
|
+
"""
|
|
259
|
+
Check data validity and detect anomalies.
|
|
260
|
+
|
|
261
|
+
Args:
|
|
262
|
+
df: DataFrame with glucose column
|
|
263
|
+
|
|
264
|
+
Returns:
|
|
265
|
+
Tuple of (validity_score, list of anomalies)
|
|
266
|
+
"""
|
|
267
|
+
anomalies: List[DataAnomaly] = []
|
|
268
|
+
|
|
269
|
+
if 'glucose' not in df.columns:
|
|
270
|
+
return 1.0, anomalies
|
|
271
|
+
|
|
272
|
+
glucose = df['glucose'].dropna()
|
|
273
|
+
|
|
274
|
+
if len(glucose) == 0:
|
|
275
|
+
return 1.0, anomalies
|
|
276
|
+
|
|
277
|
+
# Check for impossible values
|
|
278
|
+
for idx, value in glucose.items():
|
|
279
|
+
if value < self.GLUCOSE_LIMITS['minimum']:
|
|
280
|
+
anomalies.append(DataAnomaly(
|
|
281
|
+
index=int(idx), # type: ignore
|
|
282
|
+
timestamp=float(df.at[idx, 'timestamp']), # type: ignore
|
|
283
|
+
value=value,
|
|
284
|
+
anomaly_type='impossible_value',
|
|
285
|
+
severity='high',
|
|
286
|
+
description=f"Glucose {value:.1f} mg/dL below physiological minimum ({self.GLUCOSE_LIMITS['minimum']})"
|
|
287
|
+
))
|
|
288
|
+
elif value > self.GLUCOSE_LIMITS['maximum']:
|
|
289
|
+
anomalies.append(DataAnomaly(
|
|
290
|
+
index=int(idx), # type: ignore
|
|
291
|
+
timestamp=float(df.at[idx, 'timestamp']), # type: ignore
|
|
292
|
+
value=value,
|
|
293
|
+
anomaly_type='impossible_value',
|
|
294
|
+
severity='high',
|
|
295
|
+
description=f"Glucose {value:.1f} mg/dL above physiological maximum ({self.GLUCOSE_LIMITS['maximum']})"
|
|
296
|
+
))
|
|
297
|
+
|
|
298
|
+
# Check for outliers using IQR method
|
|
299
|
+
q1 = glucose.quantile(0.25)
|
|
300
|
+
q3 = glucose.quantile(0.75)
|
|
301
|
+
iqr = q3 - q1
|
|
302
|
+
lower_bound = q1 - 3 * iqr # Using 3*IQR for extreme outliers
|
|
303
|
+
upper_bound = q3 + 3 * iqr
|
|
304
|
+
|
|
305
|
+
for idx, value in glucose.items():
|
|
306
|
+
if value < lower_bound or value > upper_bound:
|
|
307
|
+
severity = 'low' if (abs(value - glucose.median()) < 3 * iqr) else 'medium'
|
|
308
|
+
anomalies.append(DataAnomaly(
|
|
309
|
+
index=int(idx), # type: ignore
|
|
310
|
+
timestamp=float(df.at[idx, 'timestamp']), # type: ignore
|
|
311
|
+
value=value,
|
|
312
|
+
anomaly_type='outlier',
|
|
313
|
+
severity=severity,
|
|
314
|
+
description=f"Outlier glucose value {value:.1f} mg/dL"
|
|
315
|
+
))
|
|
316
|
+
|
|
317
|
+
# Check for rapid glucose changes (physiologically impossible)
|
|
318
|
+
if 'timestamp' in df.columns:
|
|
319
|
+
glucose_with_time = df[['timestamp', 'glucose']].dropna().sort_values('timestamp')
|
|
320
|
+
if len(glucose_with_time) >= 2:
|
|
321
|
+
time_diff = glucose_with_time['timestamp'].diff() # type: ignore
|
|
322
|
+
glucose_diff = glucose_with_time['glucose'].diff() # type: ignore
|
|
323
|
+
|
|
324
|
+
# Rate of change in mg/dL per minute
|
|
325
|
+
rate_of_change = glucose_diff / time_diff
|
|
326
|
+
|
|
327
|
+
# Use the new class attribute
|
|
328
|
+
max_rate = self.PHYSIOLOGICAL_RATES['max_glucose_change_per_min']
|
|
329
|
+
rapid_change_mask = rate_of_change.abs() > max_rate
|
|
330
|
+
|
|
331
|
+
for idx in rate_of_change[rapid_change_mask].index:
|
|
332
|
+
change = glucose_diff.loc[idx] # type: ignore
|
|
333
|
+
time_delta = time_diff.loc[idx] # type: ignore
|
|
334
|
+
actual_rate = rate_of_change.loc[idx] # type: ignore
|
|
335
|
+
|
|
336
|
+
direction = "rise" if change > 0 else "drop"
|
|
337
|
+
description = (f"Impossible glucose {direction} of {actual_rate:.1f} mg/dL/min "
|
|
338
|
+
f"(changed by {change:.1f} in {time_delta:.1f} min)")
|
|
339
|
+
|
|
340
|
+
anomalies.append(DataAnomaly(
|
|
341
|
+
index=int(idx), # type: ignore
|
|
342
|
+
timestamp=float(df.at[idx, 'timestamp']), # type: ignore
|
|
343
|
+
value=glucose_with_time.loc[idx, 'glucose'], # type: ignore
|
|
344
|
+
anomaly_type='rapid_change',
|
|
345
|
+
severity='high',
|
|
346
|
+
description=description
|
|
347
|
+
))
|
|
348
|
+
|
|
349
|
+
# Calculate validity score
|
|
350
|
+
total_points = len(glucose)
|
|
351
|
+
invalid_points = len(anomalies)
|
|
352
|
+
|
|
353
|
+
if total_points == 0:
|
|
354
|
+
return 1.0, anomalies
|
|
355
|
+
|
|
356
|
+
validity = 1.0 - (invalid_points / total_points)
|
|
357
|
+
|
|
358
|
+
return max(0.0, validity), anomalies
|
|
359
|
+
|
|
360
|
+
def check(self, df: pd.DataFrame) -> QualityReport:
|
|
361
|
+
"""
|
|
362
|
+
Perform comprehensive data quality check.
|
|
363
|
+
|
|
364
|
+
Args:
|
|
365
|
+
df: DataFrame to check
|
|
366
|
+
|
|
367
|
+
Returns:
|
|
368
|
+
QualityReport with all findings
|
|
369
|
+
"""
|
|
370
|
+
warnings = []
|
|
371
|
+
|
|
372
|
+
# Run all checks
|
|
373
|
+
completeness, gaps = self.check_completeness(df)
|
|
374
|
+
consistency = self.check_consistency(df)
|
|
375
|
+
validity, anomalies = self.check_validity(df)
|
|
376
|
+
|
|
377
|
+
# Calculate overall score (weighted average)
|
|
378
|
+
overall = (
|
|
379
|
+
completeness * 0.4 +
|
|
380
|
+
consistency * 0.3 +
|
|
381
|
+
validity * 0.3
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
# Generate warnings
|
|
385
|
+
for gap in gaps:
|
|
386
|
+
warnings.append(gap.get_warning_message())
|
|
387
|
+
warnings.append(
|
|
388
|
+
f" [INFO] Simulation confidence score decreases to {max(0, overall - gap.percentage_of_total * 0.01):.2f}"
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
for anomaly in anomalies:
|
|
392
|
+
if anomaly.severity == 'high':
|
|
393
|
+
warnings.append(
|
|
394
|
+
f"[WARN] CRITICAL ANOMALY: {anomaly.description} at index {anomaly.index}"
|
|
395
|
+
)
|
|
396
|
+
elif anomaly.severity == 'medium':
|
|
397
|
+
warnings.append(
|
|
398
|
+
f"[WARN] ANOMALY: {anomaly.description} at index {anomaly.index}"
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
# Summary generation
|
|
402
|
+
if overall >= 0.9:
|
|
403
|
+
summary = "Excellent data quality"
|
|
404
|
+
elif overall >= 0.75:
|
|
405
|
+
summary = "Good data quality with minor issues"
|
|
406
|
+
elif overall >= 0.5:
|
|
407
|
+
summary = "Moderate data quality - use with caution"
|
|
408
|
+
elif overall >= 0.25:
|
|
409
|
+
summary = "Poor data quality - significant gaps detected"
|
|
410
|
+
else:
|
|
411
|
+
summary = "Critical data quality issues - simulation may be unreliable"
|
|
412
|
+
|
|
413
|
+
return QualityReport(
|
|
414
|
+
overall_score=overall,
|
|
415
|
+
completeness_score=completeness,
|
|
416
|
+
consistency_score=consistency,
|
|
417
|
+
validity_score=validity,
|
|
418
|
+
gaps=gaps,
|
|
419
|
+
anomalies=anomalies,
|
|
420
|
+
warnings=warnings,
|
|
421
|
+
summary=summary
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
def get_confidence_score(self, df: pd.DataFrame) -> float:
|
|
425
|
+
"""
|
|
426
|
+
Get overall confidence score for simulation.
|
|
427
|
+
|
|
428
|
+
Args:
|
|
429
|
+
df: DataFrame to check
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
Confidence score (0.0 - 1.0)
|
|
433
|
+
"""
|
|
434
|
+
report = self.check(df)
|
|
435
|
+
return report.overall_score
|
|
436
|
+
|
|
437
|
+
def print_report(self, report: QualityReport):
|
|
438
|
+
"""Print formatted quality report"""
|
|
439
|
+
print("\n" + "=" * 70)
|
|
440
|
+
print("DATA QUALITY REPORT")
|
|
441
|
+
print("=" * 70)
|
|
442
|
+
|
|
443
|
+
# Overall score with visual indicator
|
|
444
|
+
score_bar = "█" * int(report.overall_score * 20) + "░" * (20 - int(report.overall_score * 20))
|
|
445
|
+
print(f"\nOverall Score: [{score_bar}] {report.overall_score:.1%}")
|
|
446
|
+
print(f"Summary: {report.summary}")
|
|
447
|
+
|
|
448
|
+
# Component scores
|
|
449
|
+
print(f"\nComponent Scores:")
|
|
450
|
+
print(f" Completeness: {report.completeness_score:.1%}")
|
|
451
|
+
print(f" Consistency: {report.consistency_score:.1%}")
|
|
452
|
+
print(f" Validity: {report.validity_score:.1%}")
|
|
453
|
+
|
|
454
|
+
# Gaps
|
|
455
|
+
if report.gaps:
|
|
456
|
+
print(f"\nData Gaps Found: {len(report.gaps)}")
|
|
457
|
+
for gap in report.gaps:
|
|
458
|
+
print(f" {gap.get_warning_message()}")
|
|
459
|
+
|
|
460
|
+
# Anomalies
|
|
461
|
+
high_anomalies = [a for a in report.anomalies if a.severity == 'high']
|
|
462
|
+
medium_anomalies = [a for a in report.anomalies if a.severity == 'medium']
|
|
463
|
+
|
|
464
|
+
if high_anomalies:
|
|
465
|
+
print(f"\nCRITICAL Anomalies: {len(high_anomalies)}")
|
|
466
|
+
for anomaly in high_anomalies:
|
|
467
|
+
print(f" {anomaly.description}")
|
|
468
|
+
|
|
469
|
+
if medium_anomalies:
|
|
470
|
+
print(f"\nWarnings: {len(medium_anomalies)}")
|
|
471
|
+
for anomaly in medium_anomalies[:5]: # Show first 5
|
|
472
|
+
print(f" {anomaly.description}")
|
|
473
|
+
if len(medium_anomalies) > 5:
|
|
474
|
+
print(f" ... and {len(medium_anomalies) - 5} more")
|
|
475
|
+
|
|
476
|
+
# Warnings
|
|
477
|
+
if report.warnings:
|
|
478
|
+
print(f"\n{'='*70}")
|
|
479
|
+
print("WARNINGS")
|
|
480
|
+
print("=" * 70)
|
|
481
|
+
for warning in report.warnings:
|
|
482
|
+
print(f" {warning}")
|
|
483
|
+
|
|
484
|
+
print("\n" + "=" * 70)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def demo_quality_checker():
    """Demonstrate data quality checking on three synthetic datasets:
    clean data, data with a contiguous gap, and data with injected anomalies."""
    separator = "=" * 70
    print(separator)
    print("DATA QUALITY CHECKER DEMONSTRATION")
    print(separator)

    checker = DataQualityChecker(expected_interval=5, source_type='cgm')

    # --- Test case 1: clean, regularly sampled data ----------------------
    print("\nTest Case 1: Clean Data (Simulated)")
    print("-" * 50)

    # Deterministic synthetic CGM trace: 8 hours at 5-minute intervals
    np.random.seed(42)
    minutes = np.arange(0, 480, 5)
    bg = 120 + 30 * np.sin(minutes / 60) + np.random.normal(0, 5, len(minutes))
    bg = np.clip(bg, 40, 400)  # keep within a plausible range

    clean_df = pd.DataFrame({
        'timestamp': minutes,
        'glucose': bg,
        'carbs': np.random.choice([0, 30, 60], len(minutes), p=[0.8, 0.15, 0.05]),
        'insulin': np.random.choice([0, 1, 2], len(minutes), p=[0.7, 0.2, 0.1])
    })

    checker.print_report(checker.check(clean_df))

    # --- Test case 2: same data with a contiguous gap --------------------
    print("\n\nTest Case 2: Data with Gaps (14:00-16:00)")
    print("-" * 50)

    # Drop a contiguous run of rows (indices 100-129) to simulate a sensor
    # outage in the middle of the recording.
    gap_df = clean_df.copy().drop(range(100, 130)).reset_index(drop=True)

    checker.print_report(checker.check(gap_df))

    # --- Test case 3: data with injected anomalies -----------------------
    print("\n\nTest Case 3: Data with Anomalies")
    print("-" * 50)

    anomaly_df = clean_df.copy()
    anomaly_df.loc[50, 'glucose'] = 15    # impossible: below physiological minimum
    anomaly_df.loc[100, 'glucose'] = 700  # impossible: above physiological maximum
    anomaly_df.loc[75, 'glucose'] = 400   # statistical outlier

    checker.print_report(checker.check(anomaly_df))

    print("\n" + separator)
    print("DATA QUALITY CHECKER DEMONSTRATION COMPLETE")
|