iints-sdk-python35 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iints/__init__.py +183 -0
- iints/analysis/__init__.py +12 -0
- iints/analysis/algorithm_xray.py +387 -0
- iints/analysis/baseline.py +92 -0
- iints/analysis/clinical_benchmark.py +198 -0
- iints/analysis/clinical_metrics.py +551 -0
- iints/analysis/clinical_tir_analyzer.py +136 -0
- iints/analysis/diabetes_metrics.py +43 -0
- iints/analysis/edge_efficiency.py +33 -0
- iints/analysis/edge_performance_monitor.py +315 -0
- iints/analysis/explainability.py +94 -0
- iints/analysis/explainable_ai.py +232 -0
- iints/analysis/hardware_benchmark.py +221 -0
- iints/analysis/metrics.py +117 -0
- iints/analysis/population_report.py +188 -0
- iints/analysis/reporting.py +345 -0
- iints/analysis/safety_index.py +311 -0
- iints/analysis/sensor_filtering.py +54 -0
- iints/analysis/validator.py +273 -0
- iints/api/__init__.py +0 -0
- iints/api/base_algorithm.py +307 -0
- iints/api/registry.py +103 -0
- iints/api/template_algorithm.py +195 -0
- iints/assets/iints_logo.png +0 -0
- iints/cli/__init__.py +0 -0
- iints/cli/cli.py +2598 -0
- iints/core/__init__.py +1 -0
- iints/core/algorithms/__init__.py +0 -0
- iints/core/algorithms/battle_runner.py +138 -0
- iints/core/algorithms/correction_bolus.py +95 -0
- iints/core/algorithms/discovery.py +92 -0
- iints/core/algorithms/fixed_basal_bolus.py +58 -0
- iints/core/algorithms/hybrid_algorithm.py +92 -0
- iints/core/algorithms/lstm_algorithm.py +138 -0
- iints/core/algorithms/mock_algorithms.py +162 -0
- iints/core/algorithms/pid_controller.py +88 -0
- iints/core/algorithms/standard_pump_algo.py +64 -0
- iints/core/device.py +0 -0
- iints/core/device_manager.py +64 -0
- iints/core/devices/__init__.py +3 -0
- iints/core/devices/models.py +160 -0
- iints/core/patient/__init__.py +9 -0
- iints/core/patient/bergman_model.py +341 -0
- iints/core/patient/models.py +285 -0
- iints/core/patient/patient_factory.py +117 -0
- iints/core/patient/profile.py +41 -0
- iints/core/safety/__init__.py +12 -0
- iints/core/safety/config.py +37 -0
- iints/core/safety/input_validator.py +95 -0
- iints/core/safety/supervisor.py +39 -0
- iints/core/simulation/__init__.py +0 -0
- iints/core/simulation/scenario_parser.py +61 -0
- iints/core/simulator.py +874 -0
- iints/core/supervisor.py +367 -0
- iints/data/__init__.py +53 -0
- iints/data/adapter.py +142 -0
- iints/data/column_mapper.py +398 -0
- iints/data/datasets.json +132 -0
- iints/data/demo/__init__.py +1 -0
- iints/data/demo/demo_cgm.csv +289 -0
- iints/data/importer.py +275 -0
- iints/data/ingestor.py +162 -0
- iints/data/nightscout.py +128 -0
- iints/data/quality_checker.py +550 -0
- iints/data/registry.py +166 -0
- iints/data/tidepool.py +38 -0
- iints/data/universal_parser.py +813 -0
- iints/data/virtual_patients/clinic_safe_baseline.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hyper_challenge.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hypo_prone.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_midnight.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_pizza.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_stress_meal.yaml +9 -0
- iints/data/virtual_patients/default_patient.yaml +11 -0
- iints/data/virtual_patients/patient_559_config.yaml +11 -0
- iints/emulation/__init__.py +80 -0
- iints/emulation/legacy_base.py +414 -0
- iints/emulation/medtronic_780g.py +337 -0
- iints/emulation/omnipod_5.py +367 -0
- iints/emulation/tandem_controliq.py +393 -0
- iints/highlevel.py +451 -0
- iints/learning/__init__.py +3 -0
- iints/learning/autonomous_optimizer.py +194 -0
- iints/learning/learning_system.py +122 -0
- iints/metrics.py +34 -0
- iints/population/__init__.py +11 -0
- iints/population/generator.py +131 -0
- iints/population/runner.py +327 -0
- iints/presets/__init__.py +28 -0
- iints/presets/presets.json +114 -0
- iints/research/__init__.py +30 -0
- iints/research/config.py +68 -0
- iints/research/dataset.py +319 -0
- iints/research/losses.py +73 -0
- iints/research/predictor.py +329 -0
- iints/scenarios/__init__.py +3 -0
- iints/scenarios/generator.py +92 -0
- iints/templates/__init__.py +0 -0
- iints/templates/default_algorithm.py +91 -0
- iints/templates/scenarios/__init__.py +0 -0
- iints/templates/scenarios/chaos_insulin_stacking.json +29 -0
- iints/templates/scenarios/chaos_runaway_ai.json +25 -0
- iints/templates/scenarios/example_scenario.json +35 -0
- iints/templates/scenarios/exercise_stress.json +30 -0
- iints/utils/__init__.py +3 -0
- iints/utils/plotting.py +50 -0
- iints/utils/run_io.py +152 -0
- iints/validation/__init__.py +133 -0
- iints/validation/schemas.py +94 -0
- iints/visualization/__init__.py +34 -0
- iints/visualization/cockpit.py +691 -0
- iints/visualization/uncertainty_cloud.py +612 -0
- iints_sdk_python35-0.0.18.dist-info/METADATA +225 -0
- iints_sdk_python35-0.0.18.dist-info/RECORD +118 -0
- iints_sdk_python35-0.0.18.dist-info/WHEEL +5 -0
- iints_sdk_python35-0.0.18.dist-info/entry_points.txt +10 -0
- iints_sdk_python35-0.0.18.dist-info/licenses/LICENSE +28 -0
- iints_sdk_python35-0.0.18.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,813 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Universal Data Parser - IINTS-AF
|
|
4
|
+
Universal ingestion engine for any CSV/JSON data format.
|
|
5
|
+
|
|
6
|
+
This is the "Universal Data Bridge" - it accepts any data format and
|
|
7
|
+
converts it to the standard IINTS format: [Time, Glucose, Carbs, Insulin]
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import numpy as np
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, List, Optional, Tuple, Union, Any
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
|
|
18
|
+
from .column_mapper import ColumnMapper, ColumnMapping
|
|
19
|
+
from .quality_checker import DataQualityChecker, QualityReport
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class StandardDataPack:
    """
    Standard data format for IINTS-AF.

    All data is converted to this format before simulation.

    Attributes:
        data: DataFrame expected to carry the standard columns
            [timestamp, glucose, carbs, insulin]; missing ones are filled
            in by ``__post_init__``.
        metadata: Free-form provenance/context dictionary.
        quality_report: Optional quality assessment attached by the parser.
        source_file: Absolute path of the originating file, if any.
        data_format: Label describing the data layout (default "standard").
    """
    data: pd.DataFrame
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Forward reference so the class object can be created even when the
    # QualityReport name is not importable in the current context.
    quality_report: Optional["QualityReport"] = None
    source_file: Optional[str] = None
    data_format: str = "standard"

    def __post_init__(self):
        """Ensure standard columns exist.

        NOTE: this mutates ``self.data`` in place, so the caller's
        DataFrame gains any missing columns as a side effect.
        """
        required_cols = ['timestamp', 'glucose']
        optional_cols = ['carbs', 'insulin']

        for col in required_cols:
            if col not in self.data.columns:
                self.data[col] = float('nan')

        for col in optional_cols:
            if col not in self.data.columns:
                self.data[col] = 0.0

    @property
    def duration_hours(self) -> float:
        """Span between first and last timestamp, in hours (0.0 if unknown)."""
        if 'timestamp' not in self.data.columns or len(self.data) < 2:
            return 0.0
        span_minutes = self.data['timestamp'].max() - self.data['timestamp'].min()
        # An all-NaN timestamp column yields NaN here; report 0.0 instead of
        # propagating NaN to callers that format it as a number.
        if pd.isna(span_minutes):
            return 0.0
        return span_minutes / 60.0

    @property
    def data_points(self) -> int:
        """Get number of data points (rows)."""
        return len(self.data)

    @property
    def confidence_score(self) -> float:
        """Get simulation confidence score from the quality report."""
        if self.quality_report:
            return self.quality_report.overall_score
        return 0.85  # Default if no quality report
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class ParseResult:
    """Result of a parse operation.

    Attributes:
        success: True when parsing produced a usable data pack.
        data_pack: The parsed data, or None on failure.
        errors: Fatal problems encountered while parsing.
        warnings: Non-fatal issues worth surfacing to the caller.
        parse_time_seconds: Wall-clock duration of the parse.
    """
    success: bool
    # Forward reference: StandardDataPack is defined earlier in this module.
    data_pack: Optional["StandardDataPack"]
    errors: List[str]
    warnings: List[str]
    parse_time_seconds: float

    def to_dict(self) -> Dict:
        """Serialize to a plain dict, reducing the data pack to summary numbers."""
        if self.data_pack:
            # The guard above makes the original per-field
            # ``... if self.data_pack else 0`` fallbacks dead code.
            pack_summary = {
                'data_points': self.data_pack.data_points,
                'duration_hours': self.data_pack.duration_hours,
                'confidence_score': self.data_pack.confidence_score,
            }
        else:
            pack_summary = None
        return {
            'success': self.success,
            'data_pack': pack_summary,
            'errors': self.errors,
            'warnings': self.warnings,
            'parse_time_seconds': self.parse_time_seconds,
        }
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class UniversalParser:
    """
    Universal data parser for IINTS-AF.

    Accepts any CSV or JSON file and converts it to standard format.
    Handles:
    - Automatic column detection and mapping
    - Multiple date/time formats
    - Various glucose unit conversions
    - Data quality assessment

    Usage:
        parser = UniversalParser()
        result = parser.parse("patient_data.csv")
        if result.success:
            data = result.data_pack
            # Use data for simulation
    """

    # Supported date formats
    # NOTE(review): DATE_FORMATS is not referenced by any method visible in
    # this file — parse_datetime relies on pandas' own inference instead.
    # Confirm it is unused elsewhere before removing.
    DATE_FORMATS = [
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%Y-%m-%d',
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%Y %H:%M',
        '%m/%d/%Y',
        '%H:%M:%S',
        '%H:%M',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        None,  # Let pandas infer
    ]

    # Glucose unit conversions (to mg/dL)
    # NOTE(review): parse_glucose hard-codes the 18.0182 factor instead of
    # reading this table — the two should probably be unified.
    GLUCOSE_CONVERSIONS = {
        'mg/dl': 1.0,
        'mg/dL': 1.0,
        'mmol/l': 18.0182,
        'mmol/L': 18.0182,
    }
|
|
134
|
+
|
|
135
|
+
def __init__(self,
|
|
136
|
+
auto_validate: bool = True,
|
|
137
|
+
expected_interval: int = 5):
|
|
138
|
+
"""
|
|
139
|
+
Initialize universal parser.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
auto_validate: Whether to automatically run quality checks
|
|
143
|
+
expected_interval: Expected time between readings in minutes
|
|
144
|
+
"""
|
|
145
|
+
self.column_mapper = ColumnMapper()
|
|
146
|
+
self.quality_checker = DataQualityChecker(
|
|
147
|
+
expected_interval=expected_interval,
|
|
148
|
+
source_type='cgm'
|
|
149
|
+
)
|
|
150
|
+
self.auto_validate = auto_validate
|
|
151
|
+
self.expected_interval = expected_interval
|
|
152
|
+
|
|
153
|
+
def detect_format(self, file_path: str) -> str:
|
|
154
|
+
"""
|
|
155
|
+
Detect file format from extension.
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
file_path: Path to the file
|
|
159
|
+
|
|
160
|
+
Returns:
|
|
161
|
+
Detected format ('csv', 'json', 'parquet', 'unknown')
|
|
162
|
+
"""
|
|
163
|
+
path = Path(file_path)
|
|
164
|
+
suffix = path.suffix.lower()
|
|
165
|
+
|
|
166
|
+
if suffix == '.csv':
|
|
167
|
+
return 'csv'
|
|
168
|
+
elif suffix == '.json':
|
|
169
|
+
return 'json'
|
|
170
|
+
elif suffix == '.parquet':
|
|
171
|
+
return 'parquet'
|
|
172
|
+
else:
|
|
173
|
+
return 'unknown'
|
|
174
|
+
|
|
175
|
+
def detect_delimiter(self, file_path: str) -> Optional[str]:
|
|
176
|
+
"""
|
|
177
|
+
Detect CSV delimiter by analyzing the file.
|
|
178
|
+
|
|
179
|
+
Args:
|
|
180
|
+
file_path: Path to CSV file
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
Detected delimiter or None
|
|
184
|
+
"""
|
|
185
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
186
|
+
first_line = f.readline()
|
|
187
|
+
|
|
188
|
+
# Common delimiters to check
|
|
189
|
+
delimiters = [',', ';', '\t', '|']
|
|
190
|
+
detected = None
|
|
191
|
+
max_count = 0
|
|
192
|
+
|
|
193
|
+
for delimiter in delimiters:
|
|
194
|
+
count = first_line.count(delimiter)
|
|
195
|
+
if count > max_count:
|
|
196
|
+
max_count = count
|
|
197
|
+
detected = delimiter
|
|
198
|
+
|
|
199
|
+
return detected
|
|
200
|
+
|
|
201
|
+
    def parse_datetime(self, value: Any) -> Optional[float]:
        """
        Parse datetime value to minutes from start.

        Args:
            value: Datetime value to parse (numeric, datetime string, or
                "HH:MM[:SS]" time-of-day string)

        Returns:
            Minutes from start or None if parsing fails

        NOTE(review): for datetime strings this returns minutes from
        *midnight* (the date component is discarded), so a multi-day series
        parsed through this path wraps at 24h — confirm callers only feed
        single-day or pre-normalized data.
        """
        if pd.isna(value):
            return None

        # If already numeric, assume minutes
        if isinstance(value, (int, float)):
            return float(value)

        # If string, try to parse
        if isinstance(value, str):
            value = value.strip()

        try:
            # Use pandas for robust datetime parsing
            dt = pd.to_datetime(value)
            # Return minutes from midnight. The normalize_timestamps function
            # will convert this to minutes from the start of the series.
            return dt.hour * 60 + dt.minute + dt.second / 60 + dt.microsecond / 1_000_000 / 60
        except ValueError:
            # If parsing as a date fails, it might be a time-only format
            # that pandas couldn't infer.
            try:
                parts = value.split(':')
                if len(parts) >= 2:
                    hours = int(parts[0])
                    minutes = int(parts[1])
                    seconds = float(parts[2]) if len(parts) > 2 else 0.0
                    # Only accept values that form a valid time of day.
                    if 0 <= hours < 24 and 0 <= minutes < 60 and 0 <= seconds < 60:
                        return hours * 60 + minutes + seconds / 60
            except (ValueError, IndexError):
                return None  # Could not parse as time

        return None
|
|
243
|
+
|
|
244
|
+
def parse_glucose(self, value: Any, unit: str = 'mg/dL') -> Optional[float]:
|
|
245
|
+
"""
|
|
246
|
+
Parse glucose value and convert to mg/dL.
|
|
247
|
+
|
|
248
|
+
Args:
|
|
249
|
+
value: Glucose value to parse
|
|
250
|
+
unit: Unit of the value
|
|
251
|
+
|
|
252
|
+
Returns:
|
|
253
|
+
Glucose in mg/dL or None if parsing fails
|
|
254
|
+
"""
|
|
255
|
+
if pd.isna(value):
|
|
256
|
+
return None
|
|
257
|
+
|
|
258
|
+
try:
|
|
259
|
+
glucose_mgdl = float(value)
|
|
260
|
+
|
|
261
|
+
# Convert if necessary
|
|
262
|
+
if unit.lower() in ['mmol/l', 'mmol/l']:
|
|
263
|
+
glucose_mgdl *= 18.0182 # Convert mmol/L to mg/dL
|
|
264
|
+
|
|
265
|
+
return glucose_mgdl
|
|
266
|
+
except (ValueError, TypeError):
|
|
267
|
+
return None
|
|
268
|
+
|
|
269
|
+
def parse_csv(self, file_path: str) -> pd.DataFrame:
|
|
270
|
+
"""
|
|
271
|
+
Parse CSV file with automatic delimiter detection.
|
|
272
|
+
|
|
273
|
+
Args:
|
|
274
|
+
file_path: Path to CSV file
|
|
275
|
+
|
|
276
|
+
Returns:
|
|
277
|
+
Parsed DataFrame
|
|
278
|
+
"""
|
|
279
|
+
# Detect delimiter
|
|
280
|
+
delimiter = self.detect_delimiter(file_path)
|
|
281
|
+
|
|
282
|
+
# Read CSV
|
|
283
|
+
df = pd.read_csv(
|
|
284
|
+
file_path,
|
|
285
|
+
delimiter=delimiter,
|
|
286
|
+
na_values=['', 'NA', 'N/A', 'null', 'NULL', 'NaN', 'nan'],
|
|
287
|
+
keep_default_na=True
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
return df
|
|
291
|
+
|
|
292
|
+
def parse_json(self, file_path: str) -> pd.DataFrame:
|
|
293
|
+
"""
|
|
294
|
+
Parse JSON file.
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
file_path: Path to JSON file
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
Parsed DataFrame
|
|
301
|
+
"""
|
|
302
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
303
|
+
data = json.load(f)
|
|
304
|
+
|
|
305
|
+
# Handle different JSON structures
|
|
306
|
+
if isinstance(data, list):
|
|
307
|
+
df = pd.json_normalize(data)
|
|
308
|
+
elif isinstance(data, dict):
|
|
309
|
+
# Check for common nested structures
|
|
310
|
+
if 'data' in data and isinstance(data['data'], list):
|
|
311
|
+
df = pd.json_normalize(data['data'])
|
|
312
|
+
elif 'readings' in data and isinstance(data['readings'], list):
|
|
313
|
+
df = pd.json_normalize(data['readings'])
|
|
314
|
+
elif 'entries' in data and isinstance(data['entries'], list):
|
|
315
|
+
df = pd.json_normalize(data['entries'])
|
|
316
|
+
else:
|
|
317
|
+
df = pd.json_normalize(data)
|
|
318
|
+
else:
|
|
319
|
+
raise ValueError(f"Unexpected JSON structure in {file_path}")
|
|
320
|
+
|
|
321
|
+
return df
|
|
322
|
+
|
|
323
|
+
def convert_to_standard(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
|
|
324
|
+
"""
|
|
325
|
+
Convert DataFrame to standard IINTS format.
|
|
326
|
+
|
|
327
|
+
Args:
|
|
328
|
+
df: Input DataFrame with original columns
|
|
329
|
+
|
|
330
|
+
Returns:
|
|
331
|
+
DataFrame with standard columns [timestamp, glucose, carbs, insulin]
|
|
332
|
+
"""
|
|
333
|
+
result_df = df.copy()
|
|
334
|
+
|
|
335
|
+
# Map columns to standard format
|
|
336
|
+
mapping = self.column_mapper.map_columns(list(result_df.columns))
|
|
337
|
+
|
|
338
|
+
if mapping.mapped_columns:
|
|
339
|
+
# Rename columns to standard names
|
|
340
|
+
rename_dict = {v: k for k, v in mapping.mapped_columns.items()}
|
|
341
|
+
result_df = result_df.rename(columns=rename_dict)
|
|
342
|
+
|
|
343
|
+
# Parse timestamp column
|
|
344
|
+
if 'timestamp' in result_df.columns:
|
|
345
|
+
result_df['timestamp'] = result_df['timestamp'].apply(self.parse_datetime)
|
|
346
|
+
|
|
347
|
+
# Ensure glucose is numeric
|
|
348
|
+
if 'glucose' in result_df.columns:
|
|
349
|
+
result_df['glucose'] = pd.to_numeric(result_df['glucose'], errors='coerce')
|
|
350
|
+
|
|
351
|
+
# Ensure carbs and insulin are numeric
|
|
352
|
+
if 'carbs' in result_df.columns:
|
|
353
|
+
result_df['carbs'] = pd.to_numeric(result_df['carbs'], errors='coerce').fillna(0)
|
|
354
|
+
else:
|
|
355
|
+
result_df['carbs'] = 0.0
|
|
356
|
+
|
|
357
|
+
if 'insulin' in result_df.columns:
|
|
358
|
+
result_df['insulin'] = pd.to_numeric(result_df['insulin'], errors='coerce').fillna(0)
|
|
359
|
+
else:
|
|
360
|
+
result_df['insulin'] = 0.0
|
|
361
|
+
|
|
362
|
+
# Select and order standard columns
|
|
363
|
+
standard_cols = ['timestamp', 'glucose', 'carbs', 'insulin']
|
|
364
|
+
# Fix: Exclude columns that have been mapped
|
|
365
|
+
mapped_original_cols = list(mapping.mapped_columns.values())
|
|
366
|
+
other_cols = [c for c in df.columns if c not in mapped_original_cols and c not in standard_cols]
|
|
367
|
+
|
|
368
|
+
# We only want to keep the standard columns in the final dataframe
|
|
369
|
+
result_df = result_df[standard_cols]
|
|
370
|
+
|
|
371
|
+
return result_df, mapping.mapped_columns
|
|
372
|
+
|
|
373
|
+
def normalize_timestamps(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
374
|
+
"""
|
|
375
|
+
Normalize timestamps to minutes from start.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
df: DataFrame with timestamp column
|
|
379
|
+
|
|
380
|
+
Returns:
|
|
381
|
+
DataFrame with normalized timestamps
|
|
382
|
+
"""
|
|
383
|
+
if 'timestamp' not in df.columns:
|
|
384
|
+
return df
|
|
385
|
+
|
|
386
|
+
# If timestamps are already in minutes, ensure they're numeric
|
|
387
|
+
if df['timestamp'].max() < 1440: # Less than 24 hours in minutes
|
|
388
|
+
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
|
|
389
|
+
return df
|
|
390
|
+
|
|
391
|
+
# Otherwise, convert to minutes from start
|
|
392
|
+
try:
|
|
393
|
+
# Convert to datetime first
|
|
394
|
+
if df['timestamp'].dtype == object:
|
|
395
|
+
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
|
|
396
|
+
|
|
397
|
+
# Calculate minutes from start
|
|
398
|
+
start_time = df['timestamp'].min()
|
|
399
|
+
df['timestamp'] = (df['timestamp'] - start_time).dt.total_seconds() / 60
|
|
400
|
+
|
|
401
|
+
return df
|
|
402
|
+
except Exception:
|
|
403
|
+
return df
|
|
404
|
+
|
|
405
|
+
def validate_and_clean(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, QualityReport]:
|
|
406
|
+
"""
|
|
407
|
+
Validate data and return quality report.
|
|
408
|
+
|
|
409
|
+
Args:
|
|
410
|
+
df: DataFrame to validate
|
|
411
|
+
|
|
412
|
+
Returns:
|
|
413
|
+
Tuple of (cleaned DataFrame, QualityReport)
|
|
414
|
+
"""
|
|
415
|
+
report = self.quality_checker.check(df)
|
|
416
|
+
return df, report
|
|
417
|
+
|
|
418
|
+
def _clean_data_based_on_report(self, df: pd.DataFrame, report: QualityReport) -> pd.DataFrame:
|
|
419
|
+
"""
|
|
420
|
+
Cleans the DataFrame based on the QualityReport by setting anomalous glucose values to NaN.
|
|
421
|
+
"""
|
|
422
|
+
cleaned_df = df.copy()
|
|
423
|
+
|
|
424
|
+
for anomaly in report.anomalies:
|
|
425
|
+
if anomaly.anomaly_type in ['impossible_value', 'outlier', 'rapid_change']:
|
|
426
|
+
# Set the anomalous glucose value to NaN
|
|
427
|
+
# Ensure 'glucose' column exists and index is valid
|
|
428
|
+
if 'glucose' in cleaned_df.columns and anomaly.index in cleaned_df.index:
|
|
429
|
+
cleaned_df.loc[anomaly.index, 'glucose'] = np.nan
|
|
430
|
+
|
|
431
|
+
return cleaned_df
|
|
432
|
+
|
|
433
|
+
    def parse(self,
              file_path: str,
              validate: Optional[bool] = None,
              metadata: Optional[Dict] = None) -> ParseResult:
        """
        Main entry point for parsing data files.

        Args:
            file_path: Path to the data file (.csv, .json or .parquet)
            validate: Override auto_validate setting (None = use instance
                default)
            metadata: Optional metadata to add to the data pack

        Returns:
            ParseResult with data pack or errors; any exception raised
            during parsing is converted into a failed ParseResult.
        """
        import time
        start_time = time.time()

        errors: List[str] = []
        warnings: List[str] = []

        # Validate file exists
        path = Path(file_path)
        if not path.exists():
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"File not found: {file_path}"],
                warnings=[],
                parse_time_seconds=time.time() - start_time
            )

        # Detect format
        file_format = self.detect_format(file_path)
        if file_format == 'unknown':
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"Unsupported file format: {path.suffix}"],
                warnings=[],
                parse_time_seconds=time.time() - start_time
            )

        try:
            # Parse based on format
            if file_format == 'csv':
                df = self.parse_csv(file_path)
            elif file_format == 'json':
                df = self.parse_json(file_path)
            elif file_format == 'parquet':
                df = pd.read_parquet(file_path)
            else:
                # Unreachable given the detect_format guard above, kept as a
                # defensive fallback.
                raise ValueError(f"Unsupported format: {file_format}")

            # Log column mapping info (mapping runs again inside
            # convert_to_standard; this pass only collects warnings)
            mapping = self.column_mapper.map_columns(list(df.columns))
            if mapping.warnings:
                warnings.extend(mapping.warnings)

            if mapping.confidence < 0.5:
                warnings.append(f"Low column mapping confidence: {mapping.confidence:.1%}")

            # Convert to standard format
            df, column_mapping = self.convert_to_standard(df)

            # Check for required columns
            if 'glucose' not in df.columns or df['glucose'].isna().all():
                return ParseResult(
                    success=False,
                    data_pack=None,
                    errors=["No valid glucose data found in file"],
                    warnings=warnings,
                    parse_time_seconds=time.time() - start_time
                )

            # Normalize timestamps
            df = self.normalize_timestamps(df)

            # Create data pack
            data_pack = StandardDataPack(
                data=df,
                metadata=metadata or {},
                source_file=str(path.absolute())
            )

            # Validate and check quality.
            # The conditional expression means: explicit `validate` argument
            # wins; otherwise fall back to the instance's auto_validate flag.
            if validate if validate is not None else self.auto_validate:
                original_df = df.copy()  # Keep original for comparison if needed
                df, quality_report = self.validate_and_clean(original_df)

                # Clean data based on the quality report
                cleaned_df = self._clean_data_based_on_report(df, quality_report)
                data_pack.data = cleaned_df  # Store the cleaned DataFrame
                data_pack.quality_report = quality_report

                if quality_report.overall_score < 0.5:
                    warnings.append(
                        f"Data quality is low ({quality_report.overall_score:.1%}). "
                        f"Simulation results may be unreliable."
                    )

                # Add quality warnings
                warnings.extend(quality_report.warnings)

            # Add metadata
            data_pack.metadata.update({
                'source_file': str(path.absolute()),
                'source_format': file_format,
                'column_mapping': column_mapping,
                'data_points': len(df),
                'duration_hours': data_pack.duration_hours
            })

            return ParseResult(
                success=True,
                data_pack=data_pack,
                errors=errors,
                warnings=warnings,
                parse_time_seconds=time.time() - start_time
            )

        except Exception as e:
            # Any failure during parsing is surfaced as a failed result
            # rather than an exception.
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"Parse error: {str(e)}"],
                warnings=warnings,
                parse_time_seconds=time.time() - start_time
            )
|
|
562
|
+
|
|
563
|
+
def parse_string(self,
|
|
564
|
+
content: str,
|
|
565
|
+
format_type: str = 'csv',
|
|
566
|
+
validate: bool = True) -> ParseResult:
|
|
567
|
+
"""
|
|
568
|
+
Parse data from a string instead of a file.
|
|
569
|
+
|
|
570
|
+
Args:
|
|
571
|
+
content: String containing data
|
|
572
|
+
format_type: Format of the data ('csv' or 'json')
|
|
573
|
+
validate: Whether to run quality checks
|
|
574
|
+
|
|
575
|
+
Returns:
|
|
576
|
+
ParseResult with data pack or errors
|
|
577
|
+
"""
|
|
578
|
+
import io
|
|
579
|
+
import time
|
|
580
|
+
start_time = time.time()
|
|
581
|
+
|
|
582
|
+
errors: List[str] = []
|
|
583
|
+
warnings: List[str] = []
|
|
584
|
+
|
|
585
|
+
try:
|
|
586
|
+
# Parse from string
|
|
587
|
+
if format_type == 'csv':
|
|
588
|
+
df = pd.read_csv(io.StringIO(content))
|
|
589
|
+
elif format_type == 'json':
|
|
590
|
+
data = json.loads(content)
|
|
591
|
+
if isinstance(data, list):
|
|
592
|
+
df = pd.json_normalize(data)
|
|
593
|
+
else:
|
|
594
|
+
df = pd.json_normalize(data)
|
|
595
|
+
else:
|
|
596
|
+
raise ValueError(f"Unsupported format: {format_type}")
|
|
597
|
+
|
|
598
|
+
# Convert to standard format
|
|
599
|
+
df, column_mapping = self.convert_to_standard(df)
|
|
600
|
+
|
|
601
|
+
# Check for required columns before further processing
|
|
602
|
+
if 'glucose' not in df.columns or df['glucose'].isna().all():
|
|
603
|
+
return ParseResult(
|
|
604
|
+
success=False,
|
|
605
|
+
data_pack=None,
|
|
606
|
+
errors=["No valid glucose data found in input string"],
|
|
607
|
+
warnings=warnings,
|
|
608
|
+
parse_time_seconds=time.time() - start_time
|
|
609
|
+
)
|
|
610
|
+
|
|
611
|
+
# Normalize timestamps
|
|
612
|
+
df = self.normalize_timestamps(df)
|
|
613
|
+
|
|
614
|
+
# Create data pack
|
|
615
|
+
data_pack = StandardDataPack(
|
|
616
|
+
data=df,
|
|
617
|
+
metadata={'source': 'string_input', 'column_mapping': column_mapping},
|
|
618
|
+
source_file=None
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
# Validate if requested
|
|
622
|
+
if validate:
|
|
623
|
+
# Keep original for comparison if needed for cleaning
|
|
624
|
+
original_df_for_quality_check = df.copy()
|
|
625
|
+
_, quality_report = self.validate_and_clean(original_df_for_quality_check)
|
|
626
|
+
|
|
627
|
+
# Clean data based on the quality report
|
|
628
|
+
cleaned_df = self._clean_data_based_on_report(df.copy(), quality_report) # Pass a copy to avoid modifying df in place if it's used elsewhere
|
|
629
|
+
data_pack.data = cleaned_df
|
|
630
|
+
data_pack.quality_report = quality_report
|
|
631
|
+
warnings.extend(quality_report.warnings)
|
|
632
|
+
|
|
633
|
+
if quality_report.overall_score < 0.5:
|
|
634
|
+
warnings.append(
|
|
635
|
+
f"Data quality is low ({quality_report.overall_score:.1%}). "
|
|
636
|
+
f"Simulation results may be unreliable."
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
return ParseResult(
|
|
640
|
+
success=True,
|
|
641
|
+
data_pack=data_pack,
|
|
642
|
+
errors=errors,
|
|
643
|
+
warnings=warnings,
|
|
644
|
+
parse_time_seconds=time.time() - start_time
|
|
645
|
+
)
|
|
646
|
+
|
|
647
|
+
except Exception as e:
|
|
648
|
+
return ParseResult(
|
|
649
|
+
success=False,
|
|
650
|
+
data_pack=None,
|
|
651
|
+
errors=[f"Parse error: {str(e)}"],
|
|
652
|
+
warnings=warnings,
|
|
653
|
+
parse_time_seconds=time.time() - start_time
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
def demo_universal_parser():
    """Demonstrate universal parsing functionality.

    Runs five self-contained demos: a real dataset (if present on disk),
    clean synthetic data, data with quality problems, non-standard column
    names, and cleaning of physiologically impossible readings.
    """
    print("=" * 70)
    print("UNIVERSAL DATA PARSER DEMONSTRATION")
    print("=" * 70)

    parser = UniversalParser()

    # Demo 1: Parse existing Ohio data (skipped silently if the file is absent)
    print("\n Demo 1: Parse Ohio T1DM Dataset")
    print("-" * 50)

    ohio_path = Path("data_packs/public/ohio_t1dm/patient_559/timeseries.csv")
    if ohio_path.exists():
        result = parser.parse(str(ohio_path))

        if result.success:
            data_pack = result.data_pack
            print(f" Successfully parsed {data_pack.data_points} data points")
            print(f" Duration: {data_pack.duration_hours:.1f} hours")
            print(f" Confidence Score: {data_pack.confidence_score:.1%}")
            print(f"\n Data Preview:")
            print(data_pack.data.head(3).to_string())

            if data_pack.quality_report:
                print(f"\n Quality Report:")
                print(f" - Completeness: {data_pack.quality_report.completeness_score:.1%}")
                print(f" - Consistency: {data_pack.quality_report.consistency_score:.1%}")
                print(f" - Validity: {data_pack.quality_report.validity_score:.1%}")
        else:
            print(f" Parse failed: {result.errors}")

    # Demo 2: Parse synthetic data with already-standard column names
    print("\n\n Demo 2: Parse Synthetic Data")
    print("-" * 50)

    synthetic_csv = """timestamp,glucose_mg_dl,carbs_grams,insulin_units
0,120,0,0
5,125,0,0.5
10,130,30,0
15,140,0,1.0
20,145,0,0
25,150,0,0
30,148,0,0
35,145,0,0
40,140,0,0
45,135,0,0
50,130,0,0"""

    result = parser.parse_string(synthetic_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Successfully parsed synthetic data")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Duration: {data_pack.duration_hours:.1f} hours")
        print(f" Confidence: {data_pack.confidence_score:.1%}")
        print(f"\n Data Preview:")
        print(data_pack.data.to_string())

        if result.warnings:
            print(f"\n Warnings:")
            for w in result.warnings:
                print(f" {w}")

    # Demo 3: Data with issues (an impossible spike, a gap, a sudden drop)
    print("\n\n Demo 3: Data with Quality Issues")
    print("-" * 50)

    problematic_csv = """timestamp,glucose,carbs,insulin
0,120,0,0
5,700,0,0
10,130,0,0
15,,0,0
20,140,0,0
25,145,0,0
30,50,0,0
35,140,0,0
40,155,0,0"""

    result = parser.parse_string(problematic_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Parsed with quality issues detected")
        print(f" Confidence: {data_pack.confidence_score:.1%}")
        print(f"\n Warnings:")
        for w in result.warnings:
            print(f" {w}")
    else:
        print(f" Parse failed: {result.errors}")

    # Demo 4: Different column names, resolved via the column mapper
    print("\n\n Demo 4: Different Column Names (Custom Format)")
    print("-" * 50)

    custom_csv = """Time (min),BG Value,Carbohydrates,Insulin (U)
0,115,0,0
5,118,0,0.5
10,125,30,0
15,135,0,1.0
20,140,0,0"""

    result = parser.parse_string(custom_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Successfully parsed custom format")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Column Mapping: {data_pack.metadata.get('column_mapping', {})}")
        print(f"\n Data Preview:")
        print(data_pack.data.to_string())


    # Demo 5: Cleaning data with physiological feasibility check
    print("\n\n Demo 5: Cleaning Data with Physiological Feasibility Check")
    print("-" * 50)

    dirty_csv = """Time,Glucose.Level,Meal.Carbs,Delivered.Insulin
0,120,0,0
5,800,0,0
10,130,0,0
15,140,0,0
20,250,0,0
25,155,0,0
"""

    result = parser.parse_string(dirty_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print("Successfully parsed and cleaned dirty data")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Column Mapping: {data_pack.metadata.get('column_mapping', {})}")
        print("\n Original Data Preview:")
        # To show the original data, we can re-parse without validation
        original_result = parser.parse_string(dirty_csv, format_type='csv', validate=False)
        if original_result.success:
            print(original_result.data_pack.data.to_string())

        print("\n Cleaned Data Preview (anomalies set to NaN):")
        print(data_pack.data.to_string())

        if result.warnings:
            print("\n Warnings:")
            for w in result.warnings:
                print(f" {w}")
    else:
        print(f" Parse failed: {result.errors}")

    print("\n" + "=" * 70)
    print("UNIVERSAL DATA PARSER DEMONSTRATION COMPLETE")
    print("=" * 70)
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
if __name__ == "__main__":
    # Run the demonstration when executed as a script.
    demo_universal_parser()
|