iints-sdk-python35 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. iints/__init__.py +183 -0
  2. iints/analysis/__init__.py +12 -0
  3. iints/analysis/algorithm_xray.py +387 -0
  4. iints/analysis/baseline.py +92 -0
  5. iints/analysis/clinical_benchmark.py +198 -0
  6. iints/analysis/clinical_metrics.py +551 -0
  7. iints/analysis/clinical_tir_analyzer.py +136 -0
  8. iints/analysis/diabetes_metrics.py +43 -0
  9. iints/analysis/edge_efficiency.py +33 -0
  10. iints/analysis/edge_performance_monitor.py +315 -0
  11. iints/analysis/explainability.py +94 -0
  12. iints/analysis/explainable_ai.py +232 -0
  13. iints/analysis/hardware_benchmark.py +221 -0
  14. iints/analysis/metrics.py +117 -0
  15. iints/analysis/population_report.py +188 -0
  16. iints/analysis/reporting.py +345 -0
  17. iints/analysis/safety_index.py +311 -0
  18. iints/analysis/sensor_filtering.py +54 -0
  19. iints/analysis/validator.py +273 -0
  20. iints/api/__init__.py +0 -0
  21. iints/api/base_algorithm.py +307 -0
  22. iints/api/registry.py +103 -0
  23. iints/api/template_algorithm.py +195 -0
  24. iints/assets/iints_logo.png +0 -0
  25. iints/cli/__init__.py +0 -0
  26. iints/cli/cli.py +2598 -0
  27. iints/core/__init__.py +1 -0
  28. iints/core/algorithms/__init__.py +0 -0
  29. iints/core/algorithms/battle_runner.py +138 -0
  30. iints/core/algorithms/correction_bolus.py +95 -0
  31. iints/core/algorithms/discovery.py +92 -0
  32. iints/core/algorithms/fixed_basal_bolus.py +58 -0
  33. iints/core/algorithms/hybrid_algorithm.py +92 -0
  34. iints/core/algorithms/lstm_algorithm.py +138 -0
  35. iints/core/algorithms/mock_algorithms.py +162 -0
  36. iints/core/algorithms/pid_controller.py +88 -0
  37. iints/core/algorithms/standard_pump_algo.py +64 -0
  38. iints/core/device.py +0 -0
  39. iints/core/device_manager.py +64 -0
  40. iints/core/devices/__init__.py +3 -0
  41. iints/core/devices/models.py +160 -0
  42. iints/core/patient/__init__.py +9 -0
  43. iints/core/patient/bergman_model.py +341 -0
  44. iints/core/patient/models.py +285 -0
  45. iints/core/patient/patient_factory.py +117 -0
  46. iints/core/patient/profile.py +41 -0
  47. iints/core/safety/__init__.py +12 -0
  48. iints/core/safety/config.py +37 -0
  49. iints/core/safety/input_validator.py +95 -0
  50. iints/core/safety/supervisor.py +39 -0
  51. iints/core/simulation/__init__.py +0 -0
  52. iints/core/simulation/scenario_parser.py +61 -0
  53. iints/core/simulator.py +874 -0
  54. iints/core/supervisor.py +367 -0
  55. iints/data/__init__.py +53 -0
  56. iints/data/adapter.py +142 -0
  57. iints/data/column_mapper.py +398 -0
  58. iints/data/datasets.json +132 -0
  59. iints/data/demo/__init__.py +1 -0
  60. iints/data/demo/demo_cgm.csv +289 -0
  61. iints/data/importer.py +275 -0
  62. iints/data/ingestor.py +162 -0
  63. iints/data/nightscout.py +128 -0
  64. iints/data/quality_checker.py +550 -0
  65. iints/data/registry.py +166 -0
  66. iints/data/tidepool.py +38 -0
  67. iints/data/universal_parser.py +813 -0
  68. iints/data/virtual_patients/clinic_safe_baseline.yaml +9 -0
  69. iints/data/virtual_patients/clinic_safe_hyper_challenge.yaml +9 -0
  70. iints/data/virtual_patients/clinic_safe_hypo_prone.yaml +9 -0
  71. iints/data/virtual_patients/clinic_safe_midnight.yaml +9 -0
  72. iints/data/virtual_patients/clinic_safe_pizza.yaml +9 -0
  73. iints/data/virtual_patients/clinic_safe_stress_meal.yaml +9 -0
  74. iints/data/virtual_patients/default_patient.yaml +11 -0
  75. iints/data/virtual_patients/patient_559_config.yaml +11 -0
  76. iints/emulation/__init__.py +80 -0
  77. iints/emulation/legacy_base.py +414 -0
  78. iints/emulation/medtronic_780g.py +337 -0
  79. iints/emulation/omnipod_5.py +367 -0
  80. iints/emulation/tandem_controliq.py +393 -0
  81. iints/highlevel.py +451 -0
  82. iints/learning/__init__.py +3 -0
  83. iints/learning/autonomous_optimizer.py +194 -0
  84. iints/learning/learning_system.py +122 -0
  85. iints/metrics.py +34 -0
  86. iints/population/__init__.py +11 -0
  87. iints/population/generator.py +131 -0
  88. iints/population/runner.py +327 -0
  89. iints/presets/__init__.py +28 -0
  90. iints/presets/presets.json +114 -0
  91. iints/research/__init__.py +30 -0
  92. iints/research/config.py +68 -0
  93. iints/research/dataset.py +319 -0
  94. iints/research/losses.py +73 -0
  95. iints/research/predictor.py +329 -0
  96. iints/scenarios/__init__.py +3 -0
  97. iints/scenarios/generator.py +92 -0
  98. iints/templates/__init__.py +0 -0
  99. iints/templates/default_algorithm.py +91 -0
  100. iints/templates/scenarios/__init__.py +0 -0
  101. iints/templates/scenarios/chaos_insulin_stacking.json +29 -0
  102. iints/templates/scenarios/chaos_runaway_ai.json +25 -0
  103. iints/templates/scenarios/example_scenario.json +35 -0
  104. iints/templates/scenarios/exercise_stress.json +30 -0
  105. iints/utils/__init__.py +3 -0
  106. iints/utils/plotting.py +50 -0
  107. iints/utils/run_io.py +152 -0
  108. iints/validation/__init__.py +133 -0
  109. iints/validation/schemas.py +94 -0
  110. iints/visualization/__init__.py +34 -0
  111. iints/visualization/cockpit.py +691 -0
  112. iints/visualization/uncertainty_cloud.py +612 -0
  113. iints_sdk_python35-0.0.18.dist-info/METADATA +225 -0
  114. iints_sdk_python35-0.0.18.dist-info/RECORD +118 -0
  115. iints_sdk_python35-0.0.18.dist-info/WHEEL +5 -0
  116. iints_sdk_python35-0.0.18.dist-info/entry_points.txt +10 -0
  117. iints_sdk_python35-0.0.18.dist-info/licenses/LICENSE +28 -0
  118. iints_sdk_python35-0.0.18.dist-info/top_level.txt +1 -0
@@ -0,0 +1,813 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Universal Data Parser - IINTS-AF
4
+ Universal ingestion engine for any CSV/JSON data format.
5
+
6
+ This is the "Universal Data Bridge" - it accepts any data format and
7
+ converts it to the standard IINTS format: [Time, Glucose, Carbs, Insulin]
8
+ """
9
+
10
+ import json
11
+ import pandas as pd
12
+ import numpy as np
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Tuple, Union, Any
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime
17
+
18
+ from .column_mapper import ColumnMapper, ColumnMapping
19
+ from .quality_checker import DataQualityChecker, QualityReport
20
+
21
+
22
@dataclass
class StandardDataPack:
    """
    Canonical IINTS-AF data container.

    Every external source is reshaped into this structure before any
    simulation consumes it.
    """
    # Time-series table; guaranteed columns after __post_init__:
    # timestamp, glucose, carbs, insulin.
    data: pd.DataFrame
    metadata: Dict[str, Any] = field(default_factory=dict)
    quality_report: Optional[QualityReport] = None
    source_file: Optional[str] = None
    data_format: str = "standard"

    def __post_init__(self):
        """Backfill any standard columns missing from the frame."""
        # Required columns are created as all-NaN so downstream code can
        # rely on their presence; optional ones default to zero.
        for required in ['timestamp', 'glucose']:
            if required not in self.data.columns:
                self.data[required] = float('nan')

        for optional in ['carbs', 'insulin']:
            if optional not in self.data.columns:
                self.data[optional] = 0.0

    @property
    def duration_hours(self) -> float:
        """Span of the timestamp column in hours (0.0 when not computable)."""
        if 'timestamp' not in self.data.columns or len(self.data) < 2:
            return 0.0
        ts = self.data['timestamp']
        return (ts.max() - ts.min()) / 60.0

    @property
    def data_points(self) -> int:
        """Number of rows in the data table."""
        return len(self.data)

    @property
    def confidence_score(self) -> float:
        """Overall quality score, or a 0.85 default when unscored."""
        if not self.quality_report:
            return 0.85  # Default if no quality report
        return self.quality_report.overall_score
66
+
67
+
68
@dataclass
class ParseResult:
    """Outcome of one parse attempt, including diagnostics."""
    success: bool
    data_pack: Optional[StandardDataPack]
    errors: List[str]
    warnings: List[str]
    parse_time_seconds: float

    def to_dict(self) -> Dict:
        """Serialize to a plain dict; the data pack is reduced to summary stats."""
        pack_summary = None
        if self.data_pack:
            pack_summary = {
                'data_points': self.data_pack.data_points,
                'duration_hours': self.data_pack.duration_hours,
                'confidence_score': self.data_pack.confidence_score,
            }
        return {
            'success': self.success,
            'data_pack': pack_summary,
            'errors': self.errors,
            'warnings': self.warnings,
            'parse_time_seconds': self.parse_time_seconds,
        }
89
+
90
+
91
class UniversalParser:
    """
    Universal data parser for IINTS-AF.

    Accepts any CSV or JSON file and converts it to standard format.
    Handles:
    - Automatic column detection and mapping
    - Multiple date/time formats
    - Various glucose unit conversions
    - Data quality assessment

    Usage:
        parser = UniversalParser()
        result = parser.parse("patient_data.csv")
        if result.success:
            data = result.data_pack
            # Use data for simulation
    """

    # Supported date formats
    # NOTE(review): this list is not consulted by any method in this module
    # (parse_datetime delegates to pandas inference); it appears to serve as
    # reference documentation / public API — confirm before removing.
    DATE_FORMATS = [
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%Y-%m-%d',
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%Y %H:%M',
        '%m/%d/%Y',
        '%H:%M:%S',
        '%H:%M',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        None,  # Let pandas infer
    ]

    # Glucose unit conversions (to mg/dL)
    # NOTE(review): parse_glucose hard-codes the 18.0182 factor instead of
    # reading this table — kept as the canonical reference.
    GLUCOSE_CONVERSIONS = {
        'mg/dl': 1.0,
        'mg/dL': 1.0,
        'mmol/l': 18.0182,
        'mmol/L': 18.0182,
    }

    def __init__(self,
                 auto_validate: bool = True,
                 expected_interval: int = 5):
        """
        Initialize universal parser.

        Args:
            auto_validate: Whether to automatically run quality checks
            expected_interval: Expected time between readings in minutes
        """
        # Project collaborators: column-name mapping and quality scoring.
        self.column_mapper = ColumnMapper()
        self.quality_checker = DataQualityChecker(
            expected_interval=expected_interval,
            source_type='cgm'
        )
        self.auto_validate = auto_validate
        self.expected_interval = expected_interval
152
+
153
+ def detect_format(self, file_path: str) -> str:
154
+ """
155
+ Detect file format from extension.
156
+
157
+ Args:
158
+ file_path: Path to the file
159
+
160
+ Returns:
161
+ Detected format ('csv', 'json', 'parquet', 'unknown')
162
+ """
163
+ path = Path(file_path)
164
+ suffix = path.suffix.lower()
165
+
166
+ if suffix == '.csv':
167
+ return 'csv'
168
+ elif suffix == '.json':
169
+ return 'json'
170
+ elif suffix == '.parquet':
171
+ return 'parquet'
172
+ else:
173
+ return 'unknown'
174
+
175
+ def detect_delimiter(self, file_path: str) -> Optional[str]:
176
+ """
177
+ Detect CSV delimiter by analyzing the file.
178
+
179
+ Args:
180
+ file_path: Path to CSV file
181
+
182
+ Returns:
183
+ Detected delimiter or None
184
+ """
185
+ with open(file_path, 'r', encoding='utf-8') as f:
186
+ first_line = f.readline()
187
+
188
+ # Common delimiters to check
189
+ delimiters = [',', ';', '\t', '|']
190
+ detected = None
191
+ max_count = 0
192
+
193
+ for delimiter in delimiters:
194
+ count = first_line.count(delimiter)
195
+ if count > max_count:
196
+ max_count = count
197
+ detected = delimiter
198
+
199
+ return detected
200
+
201
+ def parse_datetime(self, value: Any) -> Optional[float]:
202
+ """
203
+ Parse datetime value to minutes from start.
204
+
205
+ Args:
206
+ value: Datetime value to parse
207
+
208
+ Returns:
209
+ Minutes from start or None if parsing fails
210
+ """
211
+ if pd.isna(value):
212
+ return None
213
+
214
+ # If already numeric, assume minutes
215
+ if isinstance(value, (int, float)):
216
+ return float(value)
217
+
218
+ # If string, try to parse
219
+ if isinstance(value, str):
220
+ value = value.strip()
221
+
222
+ try:
223
+ # Use pandas for robust datetime parsing
224
+ dt = pd.to_datetime(value)
225
+ # Return minutes from midnight. The normalize_timestamps function
226
+ # will convert this to minutes from the start of the series.
227
+ return dt.hour * 60 + dt.minute + dt.second / 60 + dt.microsecond / 1_000_000 / 60
228
+ except ValueError:
229
+ # If parsing as a date fails, it might be a time-only format
230
+ # that pandas couldn't infer.
231
+ try:
232
+ parts = value.split(':')
233
+ if len(parts) >= 2:
234
+ hours = int(parts[0])
235
+ minutes = int(parts[1])
236
+ seconds = float(parts[2]) if len(parts) > 2 else 0.0
237
+ if 0 <= hours < 24 and 0 <= minutes < 60 and 0 <= seconds < 60:
238
+ return hours * 60 + minutes + seconds / 60
239
+ except (ValueError, IndexError):
240
+ return None # Could not parse as time
241
+
242
+ return None
243
+
244
+ def parse_glucose(self, value: Any, unit: str = 'mg/dL') -> Optional[float]:
245
+ """
246
+ Parse glucose value and convert to mg/dL.
247
+
248
+ Args:
249
+ value: Glucose value to parse
250
+ unit: Unit of the value
251
+
252
+ Returns:
253
+ Glucose in mg/dL or None if parsing fails
254
+ """
255
+ if pd.isna(value):
256
+ return None
257
+
258
+ try:
259
+ glucose_mgdl = float(value)
260
+
261
+ # Convert if necessary
262
+ if unit.lower() in ['mmol/l', 'mmol/l']:
263
+ glucose_mgdl *= 18.0182 # Convert mmol/L to mg/dL
264
+
265
+ return glucose_mgdl
266
+ except (ValueError, TypeError):
267
+ return None
268
+
269
+ def parse_csv(self, file_path: str) -> pd.DataFrame:
270
+ """
271
+ Parse CSV file with automatic delimiter detection.
272
+
273
+ Args:
274
+ file_path: Path to CSV file
275
+
276
+ Returns:
277
+ Parsed DataFrame
278
+ """
279
+ # Detect delimiter
280
+ delimiter = self.detect_delimiter(file_path)
281
+
282
+ # Read CSV
283
+ df = pd.read_csv(
284
+ file_path,
285
+ delimiter=delimiter,
286
+ na_values=['', 'NA', 'N/A', 'null', 'NULL', 'NaN', 'nan'],
287
+ keep_default_na=True
288
+ )
289
+
290
+ return df
291
+
292
+ def parse_json(self, file_path: str) -> pd.DataFrame:
293
+ """
294
+ Parse JSON file.
295
+
296
+ Args:
297
+ file_path: Path to JSON file
298
+
299
+ Returns:
300
+ Parsed DataFrame
301
+ """
302
+ with open(file_path, 'r', encoding='utf-8') as f:
303
+ data = json.load(f)
304
+
305
+ # Handle different JSON structures
306
+ if isinstance(data, list):
307
+ df = pd.json_normalize(data)
308
+ elif isinstance(data, dict):
309
+ # Check for common nested structures
310
+ if 'data' in data and isinstance(data['data'], list):
311
+ df = pd.json_normalize(data['data'])
312
+ elif 'readings' in data and isinstance(data['readings'], list):
313
+ df = pd.json_normalize(data['readings'])
314
+ elif 'entries' in data and isinstance(data['entries'], list):
315
+ df = pd.json_normalize(data['entries'])
316
+ else:
317
+ df = pd.json_normalize(data)
318
+ else:
319
+ raise ValueError(f"Unexpected JSON structure in {file_path}")
320
+
321
+ return df
322
+
323
+ def convert_to_standard(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
324
+ """
325
+ Convert DataFrame to standard IINTS format.
326
+
327
+ Args:
328
+ df: Input DataFrame with original columns
329
+
330
+ Returns:
331
+ DataFrame with standard columns [timestamp, glucose, carbs, insulin]
332
+ """
333
+ result_df = df.copy()
334
+
335
+ # Map columns to standard format
336
+ mapping = self.column_mapper.map_columns(list(result_df.columns))
337
+
338
+ if mapping.mapped_columns:
339
+ # Rename columns to standard names
340
+ rename_dict = {v: k for k, v in mapping.mapped_columns.items()}
341
+ result_df = result_df.rename(columns=rename_dict)
342
+
343
+ # Parse timestamp column
344
+ if 'timestamp' in result_df.columns:
345
+ result_df['timestamp'] = result_df['timestamp'].apply(self.parse_datetime)
346
+
347
+ # Ensure glucose is numeric
348
+ if 'glucose' in result_df.columns:
349
+ result_df['glucose'] = pd.to_numeric(result_df['glucose'], errors='coerce')
350
+
351
+ # Ensure carbs and insulin are numeric
352
+ if 'carbs' in result_df.columns:
353
+ result_df['carbs'] = pd.to_numeric(result_df['carbs'], errors='coerce').fillna(0)
354
+ else:
355
+ result_df['carbs'] = 0.0
356
+
357
+ if 'insulin' in result_df.columns:
358
+ result_df['insulin'] = pd.to_numeric(result_df['insulin'], errors='coerce').fillna(0)
359
+ else:
360
+ result_df['insulin'] = 0.0
361
+
362
+ # Select and order standard columns
363
+ standard_cols = ['timestamp', 'glucose', 'carbs', 'insulin']
364
+ # Fix: Exclude columns that have been mapped
365
+ mapped_original_cols = list(mapping.mapped_columns.values())
366
+ other_cols = [c for c in df.columns if c not in mapped_original_cols and c not in standard_cols]
367
+
368
+ # We only want to keep the standard columns in the final dataframe
369
+ result_df = result_df[standard_cols]
370
+
371
+ return result_df, mapping.mapped_columns
372
+
373
+ def normalize_timestamps(self, df: pd.DataFrame) -> pd.DataFrame:
374
+ """
375
+ Normalize timestamps to minutes from start.
376
+
377
+ Args:
378
+ df: DataFrame with timestamp column
379
+
380
+ Returns:
381
+ DataFrame with normalized timestamps
382
+ """
383
+ if 'timestamp' not in df.columns:
384
+ return df
385
+
386
+ # If timestamps are already in minutes, ensure they're numeric
387
+ if df['timestamp'].max() < 1440: # Less than 24 hours in minutes
388
+ df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
389
+ return df
390
+
391
+ # Otherwise, convert to minutes from start
392
+ try:
393
+ # Convert to datetime first
394
+ if df['timestamp'].dtype == object:
395
+ df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
396
+
397
+ # Calculate minutes from start
398
+ start_time = df['timestamp'].min()
399
+ df['timestamp'] = (df['timestamp'] - start_time).dt.total_seconds() / 60
400
+
401
+ return df
402
+ except Exception:
403
+ return df
404
+
405
+ def validate_and_clean(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, QualityReport]:
406
+ """
407
+ Validate data and return quality report.
408
+
409
+ Args:
410
+ df: DataFrame to validate
411
+
412
+ Returns:
413
+ Tuple of (cleaned DataFrame, QualityReport)
414
+ """
415
+ report = self.quality_checker.check(df)
416
+ return df, report
417
+
418
+ def _clean_data_based_on_report(self, df: pd.DataFrame, report: QualityReport) -> pd.DataFrame:
419
+ """
420
+ Cleans the DataFrame based on the QualityReport by setting anomalous glucose values to NaN.
421
+ """
422
+ cleaned_df = df.copy()
423
+
424
+ for anomaly in report.anomalies:
425
+ if anomaly.anomaly_type in ['impossible_value', 'outlier', 'rapid_change']:
426
+ # Set the anomalous glucose value to NaN
427
+ # Ensure 'glucose' column exists and index is valid
428
+ if 'glucose' in cleaned_df.columns and anomaly.index in cleaned_df.index:
429
+ cleaned_df.loc[anomaly.index, 'glucose'] = np.nan
430
+
431
+ return cleaned_df
432
+
433
    def parse(self,
              file_path: str,
              validate: Optional[bool] = None,
              metadata: Optional[Dict] = None) -> ParseResult:
        """
        Main entry point for parsing data files.

        Args:
            file_path: Path to the data file
            validate: Override auto_validate setting
            metadata: Optional metadata to add to the data pack

        Returns:
            ParseResult with data pack or errors
        """
        # Local import keeps timing concerns out of the module namespace.
        import time
        start_time = time.time()

        errors: List[str] = []
        warnings: List[str] = []

        # Validate file exists
        path = Path(file_path)
        if not path.exists():
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"File not found: {file_path}"],
                warnings=[],
                parse_time_seconds=time.time() - start_time
            )

        # Detect format from the extension; bail out early on unknown types.
        file_format = self.detect_format(file_path)
        if file_format == 'unknown':
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"Unsupported file format: {path.suffix}"],
                warnings=[],
                parse_time_seconds=time.time() - start_time
            )

        try:
            # Parse based on format
            if file_format == 'csv':
                df = self.parse_csv(file_path)
            elif file_format == 'json':
                df = self.parse_json(file_path)
            elif file_format == 'parquet':
                df = pd.read_parquet(file_path)
            else:
                raise ValueError(f"Unsupported format: {file_format}")

            # Log column mapping info.
            # NOTE(review): map_columns runs again inside convert_to_standard;
            # this first call is only used for its warnings/confidence.
            mapping = self.column_mapper.map_columns(list(df.columns))
            if mapping.warnings:
                warnings.extend(mapping.warnings)

            if mapping.confidence < 0.5:
                warnings.append(f"Low column mapping confidence: {mapping.confidence:.1%}")

            # Convert to standard format [timestamp, glucose, carbs, insulin].
            df, column_mapping = self.convert_to_standard(df)

            # Check for required columns — glucose is the only hard requirement.
            if 'glucose' not in df.columns or df['glucose'].isna().all():
                return ParseResult(
                    success=False,
                    data_pack=None,
                    errors=["No valid glucose data found in file"],
                    warnings=warnings,
                    parse_time_seconds=time.time() - start_time
                )

            # Normalize timestamps to minutes from the series start.
            df = self.normalize_timestamps(df)

            # Create data pack
            data_pack = StandardDataPack(
                data=df,
                metadata=metadata or {},
                source_file=str(path.absolute())
            )

            # Validate and check quality.
            # Explicit `validate` override wins; otherwise use the
            # parser-wide auto_validate default.
            if validate if validate is not None else self.auto_validate:
                original_df = df.copy()  # Keep original for comparison if needed
                # validate_and_clean only scores the data; the actual NaN-ing
                # of anomalies happens in _clean_data_based_on_report below.
                df, quality_report = self.validate_and_clean(original_df)

                # Clean data based on the quality report
                cleaned_df = self._clean_data_based_on_report(df, quality_report)
                data_pack.data = cleaned_df  # Store the cleaned DataFrame
                data_pack.quality_report = quality_report

                if quality_report.overall_score < 0.5:
                    warnings.append(
                        f"Data quality is low ({quality_report.overall_score:.1%}). "
                        f"Simulation results may be unreliable."
                    )

                # Add quality warnings
                warnings.extend(quality_report.warnings)

            # Add provenance metadata for downstream reporting.
            data_pack.metadata.update({
                'source_file': str(path.absolute()),
                'source_format': file_format,
                'column_mapping': column_mapping,
                'data_points': len(df),
                'duration_hours': data_pack.duration_hours
            })

            return ParseResult(
                success=True,
                data_pack=data_pack,
                errors=errors,
                warnings=warnings,
                parse_time_seconds=time.time() - start_time
            )

        except Exception as e:
            # Catch-all boundary: any failure in the pipeline above becomes
            # a failed ParseResult rather than an exception for the caller.
            return ParseResult(
                success=False,
                data_pack=None,
                errors=[f"Parse error: {str(e)}"],
                warnings=warnings,
                parse_time_seconds=time.time() - start_time
            )
562
+
563
+ def parse_string(self,
564
+ content: str,
565
+ format_type: str = 'csv',
566
+ validate: bool = True) -> ParseResult:
567
+ """
568
+ Parse data from a string instead of a file.
569
+
570
+ Args:
571
+ content: String containing data
572
+ format_type: Format of the data ('csv' or 'json')
573
+ validate: Whether to run quality checks
574
+
575
+ Returns:
576
+ ParseResult with data pack or errors
577
+ """
578
+ import io
579
+ import time
580
+ start_time = time.time()
581
+
582
+ errors: List[str] = []
583
+ warnings: List[str] = []
584
+
585
+ try:
586
+ # Parse from string
587
+ if format_type == 'csv':
588
+ df = pd.read_csv(io.StringIO(content))
589
+ elif format_type == 'json':
590
+ data = json.loads(content)
591
+ if isinstance(data, list):
592
+ df = pd.json_normalize(data)
593
+ else:
594
+ df = pd.json_normalize(data)
595
+ else:
596
+ raise ValueError(f"Unsupported format: {format_type}")
597
+
598
+ # Convert to standard format
599
+ df, column_mapping = self.convert_to_standard(df)
600
+
601
+ # Check for required columns before further processing
602
+ if 'glucose' not in df.columns or df['glucose'].isna().all():
603
+ return ParseResult(
604
+ success=False,
605
+ data_pack=None,
606
+ errors=["No valid glucose data found in input string"],
607
+ warnings=warnings,
608
+ parse_time_seconds=time.time() - start_time
609
+ )
610
+
611
+ # Normalize timestamps
612
+ df = self.normalize_timestamps(df)
613
+
614
+ # Create data pack
615
+ data_pack = StandardDataPack(
616
+ data=df,
617
+ metadata={'source': 'string_input', 'column_mapping': column_mapping},
618
+ source_file=None
619
+ )
620
+
621
+ # Validate if requested
622
+ if validate:
623
+ # Keep original for comparison if needed for cleaning
624
+ original_df_for_quality_check = df.copy()
625
+ _, quality_report = self.validate_and_clean(original_df_for_quality_check)
626
+
627
+ # Clean data based on the quality report
628
+ cleaned_df = self._clean_data_based_on_report(df.copy(), quality_report) # Pass a copy to avoid modifying df in place if it's used elsewhere
629
+ data_pack.data = cleaned_df
630
+ data_pack.quality_report = quality_report
631
+ warnings.extend(quality_report.warnings)
632
+
633
+ if quality_report.overall_score < 0.5:
634
+ warnings.append(
635
+ f"Data quality is low ({quality_report.overall_score:.1%}). "
636
+ f"Simulation results may be unreliable."
637
+ )
638
+
639
+ return ParseResult(
640
+ success=True,
641
+ data_pack=data_pack,
642
+ errors=errors,
643
+ warnings=warnings,
644
+ parse_time_seconds=time.time() - start_time
645
+ )
646
+
647
+ except Exception as e:
648
+ return ParseResult(
649
+ success=False,
650
+ data_pack=None,
651
+ errors=[f"Parse error: {str(e)}"],
652
+ warnings=warnings,
653
+ parse_time_seconds=time.time() - start_time
654
+ )
655
+
656
+
657
def demo_universal_parser():
    """Demonstrate universal parsing functionality"""
    print("=" * 70)
    print("UNIVERSAL DATA PARSER DEMONSTRATION")
    print("=" * 70)

    parser = UniversalParser()

    # Demo 1: Parse existing Ohio data.
    # Silently skipped when the dataset is not present on disk.
    print("\n Demo 1: Parse Ohio T1DM Dataset")
    print("-" * 50)

    ohio_path = Path("data_packs/public/ohio_t1dm/patient_559/timeseries.csv")
    if ohio_path.exists():
        result = parser.parse(str(ohio_path))

        if result.success:
            data_pack = result.data_pack
            print(f" Successfully parsed {data_pack.data_points} data points")
            print(f" Duration: {data_pack.duration_hours:.1f} hours")
            print(f" Confidence Score: {data_pack.confidence_score:.1%}")
            print(f"\n Data Preview:")
            print(data_pack.data.head(3).to_string())

            if data_pack.quality_report:
                print(f"\n Quality Report:")
                print(f" - Completeness: {data_pack.quality_report.completeness_score:.1%}")
                print(f" - Consistency: {data_pack.quality_report.consistency_score:.1%}")
                print(f" - Validity: {data_pack.quality_report.validity_score:.1%}")
        else:
            print(f" Parse failed: {result.errors}")

    # Demo 2: Parse synthetic data (already in standard-ish column names).
    print("\n\n Demo 2: Parse Synthetic Data")
    print("-" * 50)

    synthetic_csv = """timestamp,glucose_mg_dl,carbs_grams,insulin_units
0,120,0,0
5,125,0,0.5
10,130,30,0
15,140,0,1.0
20,145,0,0
25,150,0,0
30,148,0,0
35,145,0,0
40,140,0,0
45,135,0,0
50,130,0,0"""

    result = parser.parse_string(synthetic_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Successfully parsed synthetic data")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Duration: {data_pack.duration_hours:.1f} hours")
        print(f" Confidence: {data_pack.confidence_score:.1%}")
        print(f"\n Data Preview:")
        print(data_pack.data.to_string())

        if result.warnings:
            print(f"\n Warnings:")
            for w in result.warnings:
                print(f" {w}")

    # Demo 3: Data with issues (impossible values, missing readings).
    print("\n\n Demo 3: Data with Quality Issues")
    print("-" * 50)

    problematic_csv = """timestamp,glucose,carbs,insulin
0,120,0,0
5,700,0,0
10,130,0,0
15,,0,0
20,140,0,0
25,145,0,0
30,50,0,0
35,140,0,0
40,155,0,0"""

    result = parser.parse_string(problematic_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Parsed with quality issues detected")
        print(f" Confidence: {data_pack.confidence_score:.1%}")
        print(f"\n Warnings:")
        for w in result.warnings:
            print(f" {w}")
    else:
        print(f" Parse failed: {result.errors}")

    # Demo 4: Different column names, exercising the column mapper.
    print("\n\n Demo 4: Different Column Names (Custom Format)")
    print("-" * 50)

    custom_csv = """Time (min),BG Value,Carbohydrates,Insulin (U)
0,115,0,0
5,118,0,0.5
10,125,30,0
15,135,0,1.0
20,140,0,0"""

    result = parser.parse_string(custom_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print(f" Successfully parsed custom format")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Column Mapping: {data_pack.metadata.get('column_mapping', {})}")
        print(f"\n Data Preview:")
        print(data_pack.data.to_string())


    # Demo 5: Cleaning data with physiological feasibility check.
    print("\n\n Demo 5: Cleaning Data with Physiological Feasibility Check")
    print("-" * 50)

    dirty_csv = """Time,Glucose.Level,Meal.Carbs,Delivered.Insulin
0,120,0,0
5,800,0,0
10,130,0,0
15,140,0,0
20,250,0,0
25,155,0,0
"""

    result = parser.parse_string(dirty_csv, format_type='csv')

    if result.success:
        data_pack = result.data_pack
        print("Successfully parsed and cleaned dirty data")
        print(f" Data Points: {data_pack.data_points}")
        print(f" Column Mapping: {data_pack.metadata.get('column_mapping', {})}")
        print("\n Original Data Preview:")
        # To show the original data, we can re-parse without validation
        original_result = parser.parse_string(dirty_csv, format_type='csv', validate=False)
        if original_result.success:
            print(original_result.data_pack.data.to_string())

        print("\n Cleaned Data Preview (anomalies set to NaN):")
        print(data_pack.data.to_string())

        if result.warnings:
            print("\n Warnings:")
            for w in result.warnings:
                print(f" {w}")
    else:
        print(f" Parse failed: {result.errors}")

    print("\n" + "=" * 70)
    print("UNIVERSAL DATA PARSER DEMONSTRATION COMPLETE")
    print("=" * 70)
811
+
812
if __name__ == "__main__":
    # Run the interactive demo when executed as a script.
    demo_universal_parser()