iints-sdk-python35 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iints/__init__.py +134 -0
- iints/analysis/__init__.py +12 -0
- iints/analysis/algorithm_xray.py +387 -0
- iints/analysis/baseline.py +92 -0
- iints/analysis/clinical_benchmark.py +198 -0
- iints/analysis/clinical_metrics.py +551 -0
- iints/analysis/clinical_tir_analyzer.py +136 -0
- iints/analysis/diabetes_metrics.py +43 -0
- iints/analysis/edge_performance_monitor.py +315 -0
- iints/analysis/explainability.py +94 -0
- iints/analysis/explainable_ai.py +232 -0
- iints/analysis/hardware_benchmark.py +221 -0
- iints/analysis/metrics.py +117 -0
- iints/analysis/reporting.py +261 -0
- iints/analysis/sensor_filtering.py +54 -0
- iints/analysis/validator.py +273 -0
- iints/api/__init__.py +0 -0
- iints/api/base_algorithm.py +300 -0
- iints/api/template_algorithm.py +195 -0
- iints/cli/__init__.py +0 -0
- iints/cli/cli.py +1286 -0
- iints/core/__init__.py +1 -0
- iints/core/algorithms/__init__.py +0 -0
- iints/core/algorithms/battle_runner.py +138 -0
- iints/core/algorithms/correction_bolus.py +86 -0
- iints/core/algorithms/discovery.py +92 -0
- iints/core/algorithms/fixed_basal_bolus.py +52 -0
- iints/core/algorithms/hybrid_algorithm.py +92 -0
- iints/core/algorithms/lstm_algorithm.py +138 -0
- iints/core/algorithms/mock_algorithms.py +69 -0
- iints/core/algorithms/pid_controller.py +88 -0
- iints/core/algorithms/standard_pump_algo.py +64 -0
- iints/core/device.py +0 -0
- iints/core/device_manager.py +64 -0
- iints/core/devices/__init__.py +3 -0
- iints/core/devices/models.py +155 -0
- iints/core/patient/__init__.py +3 -0
- iints/core/patient/models.py +246 -0
- iints/core/patient/patient_factory.py +117 -0
- iints/core/patient/profile.py +41 -0
- iints/core/safety/__init__.py +4 -0
- iints/core/safety/input_validator.py +87 -0
- iints/core/safety/supervisor.py +29 -0
- iints/core/simulation/__init__.py +0 -0
- iints/core/simulation/scenario_parser.py +61 -0
- iints/core/simulator.py +519 -0
- iints/core/supervisor.py +275 -0
- iints/data/__init__.py +42 -0
- iints/data/adapter.py +142 -0
- iints/data/column_mapper.py +398 -0
- iints/data/demo/__init__.py +1 -0
- iints/data/demo/demo_cgm.csv +289 -0
- iints/data/importer.py +275 -0
- iints/data/ingestor.py +162 -0
- iints/data/quality_checker.py +550 -0
- iints/data/universal_parser.py +813 -0
- iints/data/virtual_patients/clinic_safe_baseline.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hyper_challenge.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_hypo_prone.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_midnight.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_pizza.yaml +9 -0
- iints/data/virtual_patients/clinic_safe_stress_meal.yaml +9 -0
- iints/data/virtual_patients/default_patient.yaml +11 -0
- iints/data/virtual_patients/patient_559_config.yaml +11 -0
- iints/emulation/__init__.py +80 -0
- iints/emulation/legacy_base.py +414 -0
- iints/emulation/medtronic_780g.py +337 -0
- iints/emulation/omnipod_5.py +367 -0
- iints/emulation/tandem_controliq.py +393 -0
- iints/highlevel.py +192 -0
- iints/learning/__init__.py +3 -0
- iints/learning/autonomous_optimizer.py +194 -0
- iints/learning/learning_system.py +122 -0
- iints/metrics.py +34 -0
- iints/presets/__init__.py +28 -0
- iints/presets/presets.json +114 -0
- iints/templates/__init__.py +0 -0
- iints/templates/default_algorithm.py +56 -0
- iints/templates/scenarios/__init__.py +0 -0
- iints/templates/scenarios/example_scenario.json +34 -0
- iints/utils/__init__.py +3 -0
- iints/utils/plotting.py +50 -0
- iints/validation/__init__.py +117 -0
- iints/validation/schemas.py +72 -0
- iints/visualization/__init__.py +34 -0
- iints/visualization/cockpit.py +691 -0
- iints/visualization/uncertainty_cloud.py +612 -0
- iints_sdk_python35-0.1.7.dist-info/METADATA +122 -0
- iints_sdk_python35-0.1.7.dist-info/RECORD +93 -0
- iints_sdk_python35-0.1.7.dist-info/WHEEL +5 -0
- iints_sdk_python35-0.1.7.dist-info/entry_points.txt +2 -0
- iints_sdk_python35-0.1.7.dist-info/licenses/LICENSE +28 -0
- iints_sdk_python35-0.1.7.dist-info/top_level.txt +1 -0
iints/data/importer.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
6
|
+
import io
|
|
7
|
+
import re
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from iints.data.ingestor import DataIngestor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _normalize_column(name: str) -> str:
|
|
16
|
+
return re.sub(r"[^a-z0-9]+", "", name.lower())
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _find_column(columns: Iterable[str], candidates: Iterable[str]) -> Optional[str]:
|
|
20
|
+
normalized = {col: _normalize_column(col) for col in columns}
|
|
21
|
+
candidate_set = {_normalize_column(c) for c in candidates}
|
|
22
|
+
for col, norm in normalized.items():
|
|
23
|
+
if norm in candidate_set:
|
|
24
|
+
return col
|
|
25
|
+
return None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Candidate source-column names per logical field, per vendor format.
# Names are compared after _normalize_column (lower-cased, alphanumerics
# only), so e.g. "Device Timestamp" in a file matches "device timestamp".
# Unknown formats fall back to "generic" at the call sites.
DEFAULT_MAPPINGS: Dict[str, Dict[str, List[str]]] = {
    "generic": {
        "timestamp": ["timestamp", "time", "datetime", "date", "eventtime", "device timestamp"],
        "glucose": ["glucose", "bg", "sgv", "sensorglucose", "glucosemgdl", "glucosevalue"],
        "carbs": ["carbs", "carb", "carbohydrates", "carbsg", "carbgrams"],
        "insulin": ["insulin", "insulinunits", "bolus", "basal", "totalinsulin"],
    },
    "dexcom": {
        "timestamp": ["timestamp", "eventtime", "device timestamp"],
        "glucose": ["glucose", "glucosevalue", "sgv", "sensorglucose"],
        "carbs": ["carbs", "carb", "carbohydrates"],
        "insulin": ["insulin", "insulinunits", "bolus", "basal"],
    },
    "libre": {
        "timestamp": ["timestamp", "device timestamp", "datetime", "time"],
        "glucose": ["glucose", "glucosevalue", "sensorglucose", "sgv"],
        "carbs": ["carbs", "carb", "carbohydrates"],
        "insulin": ["insulin", "insulinunits", "bolus", "basal"],
    },
}

# Which logical fields each format must ("required") or may ("optional")
# provide; consumed by validate_import_schema. All formats currently share
# the same shape but are kept separate so they can diverge later.
IMPORT_FORMAT_SCHEMAS: Dict[str, Dict[str, List[str]]] = {
    "generic": {
        "required": ["timestamp", "glucose"],
        "optional": ["carbs", "insulin"],
    },
    "dexcom": {
        "required": ["timestamp", "glucose"],
        "optional": ["carbs", "insulin"],
    },
    "libre": {
        "required": ["timestamp", "glucose"],
        "optional": ["carbs", "insulin"],
    },
}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class ImportResult:
    """Bundle returned by scenario_from_csv."""

    # Normalized CGM data in the universal IINTS schema
    # (timestamp, glucose, carbs, insulin, source).
    dataframe: pd.DataFrame
    # Scenario dictionary derived from the data; see scenario_from_dataframe.
    scenario: Dict[str, Any]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def guess_column_mapping(columns: Iterable[str], data_format: str = "generic") -> Dict[str, Optional[str]]:
    """
    Auto-detect which source column maps to each logical field.

    Args:
        columns: Column names of the incoming dataset (any iterable).
        data_format: Key into DEFAULT_MAPPINGS; unknown values fall back
            to "generic".

    Returns:
        Dict mapping each logical field (timestamp, glucose, carbs,
        insulin) to the matching source column name, or None if no
        candidate matches.
    """
    # Materialize once: *columns* may be a one-shot iterator, and it is
    # scanned once per logical field below.
    columns = list(columns)
    candidates = DEFAULT_MAPPINGS.get(data_format, DEFAULT_MAPPINGS["generic"])
    return {
        "timestamp": _find_column(columns, candidates.get("timestamp", [])),
        "glucose": _find_column(columns, candidates.get("glucose", [])),
        "carbs": _find_column(columns, candidates.get("carbs", [])),
        "insulin": _find_column(columns, candidates.get("insulin", [])),
    }
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def validate_import_schema(
    columns: Iterable[str],
    data_format: str,
    column_map: Optional[Dict[str, str]] = None,
) -> None:
    """
    Validate that all required columns for *data_format* can be resolved.

    Args:
        columns: Column names of the incoming dataset (any iterable).
        data_format: One of the IMPORT_FORMAT_SCHEMAS keys; unknown values
            fall back to the "generic" schema.
        column_map: Explicit logical-name -> column-name overrides; a key
            present here is considered satisfied without auto-detection.

    Raises:
        ValueError: If any required logical column is neither in
            *column_map* nor auto-detectable from *columns*.
    """
    # Materialize once: *columns* may be a one-shot iterator, yet it is
    # scanned by _find_column for every required key and again when
    # building the error message.
    columns = list(columns)
    schema = IMPORT_FORMAT_SCHEMAS.get(data_format, IMPORT_FORMAT_SCHEMAS["generic"])
    candidates = DEFAULT_MAPPINGS.get(data_format, DEFAULT_MAPPINGS["generic"])
    mapping = column_map or {}

    missing: List[str] = []
    for key in schema["required"]:
        if key in mapping:
            continue
        found = _find_column(columns, candidates.get(key, []))
        if found is None:
            missing.append(key)

    if missing:
        raise ValueError(
            f"Missing required columns for format '{data_format}': {', '.join(missing)}. "
            f"Columns: {list(columns)}"
        )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def import_cgm_dataframe(
    df: pd.DataFrame,
    data_format: str = "generic",
    column_map: Optional[Dict[str, str]] = None,
    time_unit: str = "minutes",
    source: Optional[str] = None,
) -> pd.DataFrame:
    """
    Import CGM data from an in-memory DataFrame into the universal IINTS schema.

    Args:
        df: Raw input frame. The caller's frame is not mutated: ``rename``
            returns a copy and all later writes hit that copy.
        data_format: Key into DEFAULT_MAPPINGS ("generic", "dexcom",
            "libre"); unknown values fall back to "generic".
        column_map: Explicit logical-name -> source-column overrides; falsy
            values are dropped so auto-detection still runs for those keys.
        time_unit: "seconds" divides numeric timestamps by 60; anything
            else treats them as minutes. Ignored for datetime input.
        source: Value for the output "source" column; defaults to
            *data_format*.

    Returns:
        A DataFrame containing exactly the universal-schema columns
        (timestamp, glucose, carbs, insulin, source) with timestamps as
        float minutes.

    Raises:
        ValueError: If required columns are missing or the ingestor's
            schema/quality validation fails.
    """
    columns = list(df.columns)
    mapping = column_map or {}
    # Drop falsy overrides so resolve() can still auto-detect those keys.
    mapping = {k: v for k, v in mapping.items() if v}

    candidates = DEFAULT_MAPPINGS.get(data_format, DEFAULT_MAPPINGS["generic"])

    validate_import_schema(columns, data_format=data_format, column_map=mapping)

    def resolve(key: str, required: bool = True) -> Optional[str]:
        # Explicit mapping wins; otherwise auto-detect from candidate names.
        if key in mapping:
            return mapping[key]
        col = _find_column(columns, candidates.get(key, []))
        if required and col is None:
            raise ValueError(f"Missing required column for '{key}'. Columns: {columns}")
        return col

    ts_col = resolve("timestamp", required=True)
    glucose_col = resolve("glucose", required=True)
    carbs_col = resolve("carbs", required=False)
    insulin_col = resolve("insulin", required=False)

    # carbs_col/insulin_col may be None here; pandas' rename silently
    # ignores keys that are not actual column labels, so that is harmless.
    df = df.rename(
        columns={
            ts_col: "timestamp",
            glucose_col: "glucose",
            carbs_col: "carbs",
            insulin_col: "insulin",
        }
    )

    # Optional columns default to zero when absent from the input.
    if "carbs" not in df.columns:
        df["carbs"] = 0.0
    if "insulin" not in df.columns:
        df["insulin"] = 0.0

    # Parse timestamps
    if pd.api.types.is_numeric_dtype(df["timestamp"]):
        # Assume numeric (minutes or seconds)
        if time_unit == "seconds":
            df["timestamp"] = df["timestamp"].astype(float) / 60.0
        else:
            df["timestamp"] = df["timestamp"].astype(float)
    elif pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
        # Datetime input: rebase to minutes elapsed since the first row.
        ts = df["timestamp"]
        df["timestamp"] = (ts - ts.iloc[0]).dt.total_seconds() / 60.0
    else:
        # Try datetime parsing, fallback to numeric
        ts = pd.to_datetime(df["timestamp"], errors="coerce")
        if ts.isna().all():
            # Nothing parsed as a date: treat values as numeric strings.
            if time_unit == "seconds":
                df["timestamp"] = df["timestamp"].astype(float) / 60.0
            else:
                df["timestamp"] = df["timestamp"].astype(float)
        else:
            # NOTE(review): rows that failed to parse become NaT -> NaN
            # minutes here and will fail the ingestor's timestamp check
            # below — confirm this is the intended failure mode.
            df["timestamp"] = (ts - ts.iloc[0]).dt.total_seconds() / 60.0

    df["source"] = source or data_format
    ingestor = DataIngestor()
    # Reuses the ingestor's (private) schema/quality validation so both
    # ingestion paths enforce the same rules.
    ingestor._validate_schema(df, ingestor.UNIVERSAL_SCHEMA)
    return df[list(ingestor.UNIVERSAL_SCHEMA.keys())]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def import_cgm_csv(
    path: Union[str, Path],
    data_format: str = "generic",
    column_map: Optional[Dict[str, str]] = None,
    time_unit: str = "minutes",
    source: Optional[str] = None,
) -> pd.DataFrame:
    """
    Import CGM data from CSV into the universal IINTS schema.

    Thin wrapper: reads the file with pandas and delegates all column
    mapping, timestamp parsing, and validation to import_cgm_dataframe.
    """
    raw_frame = pd.read_csv(path)
    return import_cgm_dataframe(
        raw_frame,
        data_format=data_format,
        column_map=column_map,
        time_unit=time_unit,
        source=source,
    )
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def scenario_from_dataframe(
    df: pd.DataFrame,
    scenario_name: str,
    scenario_version: str = "1.0",
    description: str = "Imported CGM scenario",
    carb_threshold: float = 0.1,
    absorption_delay_minutes: int = 10,
    duration_minutes: int = 60,
) -> Dict[str, Any]:
    """Build a simulation scenario dict from imported CGM data.

    Every row whose carbs exceed *carb_threshold* becomes a "meal" stress
    event anchored at that row's timestamp (truncated to int minutes).
    """
    meal_events: List[Dict[str, Any]] = []
    if "carbs" in df.columns:
        meals = df.loc[df["carbs"] > carb_threshold]
        for ts, grams in zip(meals["timestamp"], meals["carbs"]):
            meal_events.append(
                {
                    "start_time": int(ts),
                    "event_type": "meal",
                    "value": float(grams),
                    "absorption_delay_minutes": absorption_delay_minutes,
                    "duration": duration_minutes,
                }
            )

    return {
        "scenario_name": scenario_name,
        "scenario_version": scenario_version,
        "description": description,
        "stress_events": meal_events,
    }
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def scenario_from_csv(
    path: Union[str, Path],
    scenario_name: str = "Imported CGM Scenario",
    scenario_version: str = "1.0",
    data_format: str = "generic",
    column_map: Optional[Dict[str, str]] = None,
    time_unit: str = "minutes",
    carb_threshold: float = 0.1,
) -> ImportResult:
    """Load a CGM CSV and derive a simulation scenario from it.

    Convenience composition of import_cgm_csv and scenario_from_dataframe;
    returns both the normalized frame and the scenario in one ImportResult.
    """
    frame = import_cgm_csv(
        path,
        data_format=data_format,
        column_map=column_map,
        time_unit=time_unit,
    )
    meal_scenario = scenario_from_dataframe(
        frame,
        scenario_name=scenario_name,
        scenario_version=scenario_version,
        carb_threshold=carb_threshold,
    )
    return ImportResult(dataframe=frame, scenario=meal_scenario)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def export_standard_csv(df: pd.DataFrame, output_path: Union[str, Path]) -> str:
    """Write *df* to CSV at *output_path* (parents created) and return the path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)
    return str(target)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _read_demo_csv_text() -> str:
    """Return the text of the bundled demo CGM CSV (iints/data/demo/demo_cgm.csv)."""
    if sys.version_info < (3, 9):
        # Legacy accessor for pre-3.9 interpreters.
        from importlib import resources
        return resources.read_text("iints.data.demo", "demo_cgm.csv")
    from importlib.resources import files
    return files("iints.data.demo").joinpath("demo_cgm.csv").read_text()
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def load_demo_dataframe() -> pd.DataFrame:
    """Parse the bundled demo CGM CSV into a DataFrame."""
    buffer = io.StringIO(_read_demo_csv_text())
    return pd.read_csv(buffer)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def export_demo_csv(output_path: Union[str, Path]) -> str:
    """Copy the bundled demo CGM CSV to *output_path* (parents created); return the path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(_read_demo_csv_text())
    return str(target)
|
iints/data/ingestor.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Dict, Any, Union, Optional
|
|
4
|
+
import yaml
|
|
5
|
+
|
|
6
|
+
class DataIngestor:
    """
    Standardized Data Bridge for ingesting various diabetes datasets into a
    universal IINTS-AF format.
    """
    # Target column layout every loader must produce; values document the
    # intended dtype (enforcement is only the simplified checks in
    # _validate_schema).
    UNIVERSAL_SCHEMA = {
        "timestamp": float,
        "glucose": float,
        "carbs": float,  # Can be null, but pandas will infer float if mixed
        "insulin": float,  # Can be null, but pandas will infer float if mixed
        "source": str,
    }

    def __init__(self, data_dir: Optional[str] = None):
        """
        Args:
            data_dir: Base directory for locating data files; defaults
                to "./data". NOTE(review): no method in this class reads
                self.data_dir yet — confirm intended use by callers.
        """
        self.data_dir = data_dir or "./data"
        # Make sure the rest of your code uses this path to locate files.

    def _load_ohio_t1dm_csv(self, file_path: Path) -> pd.DataFrame:
        """
        Loads and transforms Ohio T1DM dataset CSV into universal schema.
        Expected columns in Ohio T1DM: 'timestamp', 'glucose', 'carbs', 'insulin'
        """
        df = pd.read_csv(file_path)

        # Assuming 'timestamp' is already in minutes from start or can be converted
        # For simplicity, assuming it's already a float representing minutes
        # If it's a datetime, conversion would be needed:
        # df['timestamp'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds() / 60.0

        # Rename columns to match universal schema if necessary
        # Example: if original columns were different, map them here.
        # For Ohio T1DM, let's assume they are already lowercase 'glucose', 'carbs', 'insulin'
        # if 'BG' in df.columns: df = df.rename(columns={'BG': 'glucose'})
        # if 'Carbs' in df.columns: df = df.rename(columns={'Carbs': 'carbs'})
        # if 'Insulin' in df.columns: df = df.rename(columns={'Insulin': 'insulin'})

        # Add 'source' column
        df['source'] = 'public_ohio_t1dm'

        # Ensure only universal schema columns are present and in order
        required_cols = list(self.UNIVERSAL_SCHEMA.keys())
        for col in required_cols:
            if col not in df.columns:
                df[col] = pd.NA  # Add missing columns as NA

        return df[required_cols]

    def _validate_schema(self, df: pd.DataFrame, schema: Dict[str, type]) -> None:
        """
        Validates DataFrame against the expected schema.
        Raises ValueError if validation fails.

        Checks: every schema column exists; glucose in [20, 600] mg/dL;
        insulin in [0, 50] units (skipped when the column is all-NA);
        no missing timestamps.
        """
        for col, expected_type in schema.items():
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")
            # Basic type check (pandas dtypes are more complex, this is a simplified check)
            # if not pd.api.types.is_dtype_equal(df[col].dtype, pd.Series(dtype=expected_type).dtype):
            #     print(f"Warning: Column '{col}' type mismatch. Expected {expected_type}, got {df[col].dtype}")

        # Basic quality checks (from DATA_SCHEMA.md)
        if 'glucose' in df.columns:
            # NOTE(review): NaN glucose rows fail this check (NaN comparisons
            # are False, so .all() is False) — confirm that rejecting frames
            # with missing glucose is intended.
            if not ((df['glucose'] >= 20) & (df['glucose'] <= 600)).all():
                raise ValueError("Glucose values outside acceptable range (20-600 mg/dL)")
        if 'insulin' in df.columns and not df['insulin'].isna().all():
            # Same caveat: a partially-NaN insulin column fails the range check.
            if not ((df['insulin'] >= 0) & (df['insulin'] <= 50)).all():
                raise ValueError("Insulin values outside acceptable range (0-50 units)")

        # Check for missing timestamps (assuming 'timestamp' exists)
        if df['timestamp'].isnull().any():
            raise ValueError("Missing values in 'timestamp' column.")

    def get_patient_model(self, file_path: Union[str, Path], data_type: str) -> pd.DataFrame:
        """
        Loads patient data from a file and returns it as a standardized DataFrame.

        Args:
            file_path (Union[str, Path]): Path to the data file. Can be extension-less for 'model' type.
            data_type (str): Type of the data source (e.g., 'ohio_t1dm', 'iints_standard_csv', 'model').

        Returns:
            pd.DataFrame: A DataFrame conforming to the universal IINTS-AF schema.

        Raises:
            ValueError: If the data_type is not supported or validation fails.
            FileNotFoundError: If the data file cannot be found.
        """
        file_path = Path(file_path)

        # If data type is 'model' (JSON), ensure the .json extension is present.
        if data_type == 'model' or data_type == 'iints_standard_json':
            if not file_path.suffix:
                file_path = file_path.with_suffix('.json')

        if not file_path.is_file():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        df = pd.DataFrame()
        if data_type == 'ohio_t1dm':
            df = self._load_ohio_t1dm_csv(file_path)
        elif data_type == 'iints_standard_csv':
            # Assume this is already in the universal schema format
            df = pd.read_csv(file_path)
        elif data_type == 'model' or data_type == 'iints_standard_json':
            # Assumes a records-oriented JSON file.
            df = pd.read_json(file_path, orient='records')
            # Add source column if not present
            if 'source' not in df.columns:
                df['source'] = 'iints_standard_json'

            # Ensure only universal schema columns are present and in order
            required_cols = list(self.UNIVERSAL_SCHEMA.keys())
            for col in required_cols:
                if col not in df.columns:
                    df[col] = pd.NA  # Add missing columns as NA
            df = df[required_cols]
        else:
            raise ValueError(f"Unsupported data type: {data_type}")

        # Validate the loaded data against the universal schema
        self._validate_schema(df, self.UNIVERSAL_SCHEMA)

        return df
|
|
128
|
+
|
|
129
|
+
if __name__ == "__main__":
    # Example usage: look for a timeseries.csv under any
    # data_packs/public/ohio_t1dm/patient_* directory and ingest it.
    ohio_data_path = Path("data_packs/public/ohio_t1dm")

    # Guard first: iterdir() raises FileNotFoundError on a missing
    # directory, which would defeat the friendly messages below.
    if not ohio_data_path.is_dir():
        print(f"Data directory not found: {ohio_data_path}. Cannot run example.")
    else:
        patient_dirs = [d for d in ohio_data_path.iterdir() if d.is_dir() and d.name.startswith("patient_")]

        if patient_dirs:
            # Pick the first patient directory that actually contains data.
            sample_timeseries_file = next(
                (d / "timeseries.csv" for d in patient_dirs if (d / "timeseries.csv").is_file()),
                None,
            )

            if sample_timeseries_file:
                print(f"Loading sample Ohio T1DM data from: {sample_timeseries_file}")
                ingestor = DataIngestor()
                try:
                    df = ingestor.get_patient_model(sample_timeseries_file, 'ohio_t1dm')
                    print("Data loaded successfully and validated:")
                    print(df.head())
                    df.info()
                except FileNotFoundError as e:
                    print(f"Error loading data: {e}")
                except ValueError as e:
                    print(f"Data quality issue detected during validation for {sample_timeseries_file.name}: {e}")
                except Exception as e:
                    print(f"An unexpected error occurred: {e}")
            else:
                print(f"No 'timeseries.csv' found in any patient directory within {ohio_data_path}. Cannot run example.")
        else:
            print(f"No patient directories found in {ohio_data_path}. Cannot run example.")
|