pylantir 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,367 @@
1
+ """
2
+ REDCap Data Source Plugin
3
+
4
+ This module implements the DataSourcePlugin interface for REDCap API integration.
5
+ Extracts and transforms REDCap data into DICOM worklist entries.
6
+
7
+ Version: 1.0.0
8
+ """
9
+
10
+ import os
11
+ import logging
12
+ import re
13
+ from redcap.project import Project # type: ignore
14
+ import uuid
15
+ from datetime import datetime, timedelta
16
+ import gc
17
+ from typing import Dict, List, Tuple
18
+
19
+ from .base import DataSourcePlugin, PluginFetchError, PluginConfigError
20
+
21
+ lgr = logging.getLogger(__name__)
22
+
23
+
24
class REDCapPlugin(DataSourcePlugin):
    """
    REDCap data source plugin.

    Connects to the REDCap API and fetches worklist entries with
    memory-efficient processing (plain dicts rather than pandas DataFrames).

    Configuration Requirements:
        - site_id: Site identifier
        - protocol: Protocol name (str) or site -> protocol mapping (dict)

    Environment Variables:
        - REDCAP_API_URL: REDCap API endpoint
        - REDCAP_API_TOKEN: REDCap API access token
    """

    def __init__(self):
        super().__init__()
        # Connection/configuration state; populated by validate_config().
        self._api_url = None
        self._api_token = None
        self._site_id = None
        self._protocol = None

    def validate_config(self, config: Dict) -> Tuple[bool, str]:
        """Validate REDCap plugin configuration.

        Checks required config keys (site_id, protocol) and required
        environment variables, then caches the validated values on the
        instance for later use by fetch_entries().

        Args:
            config: Plugin configuration dictionary.

        Returns:
            (True, "") on success, otherwise (False, <error message>).
        """
        # Check required config keys
        if "site_id" not in config:
            return (False, "Missing required configuration key: site_id")

        if "protocol" not in config:
            return (False, "Missing required configuration key: protocol")

        # Protocol can be either a string (protocol name) or dict (legacy format)
        if not isinstance(config["protocol"], (str, dict)):
            return (False, "protocol must be a string or dictionary")

        # Check environment variables
        self._api_url = os.getenv("REDCAP_API_URL")
        self._api_token = os.getenv("REDCAP_API_TOKEN")

        if not self._api_url:
            return (False, "REDCAP_API_URL environment variable not set")

        if not self._api_token:
            return (False, "REDCAP_API_TOKEN environment variable not set")

        self._site_id = config.get("site_id")
        self._protocol = config.get("protocol")

        self.logger.info(f"REDCap plugin validated for site {config['site_id']}")
        return (True, "")

    def fetch_entries(
        self,
        field_mapping: Dict[str, str],
        interval: float
    ) -> List[Dict]:
        """
        Fetch worklist entries from REDCap.

        Uses incremental sync based on the interval parameter to fetch only
        recently modified records.

        Args:
            field_mapping: Maps REDCap field names to worklist entry keys.
            interval: Look-back window in seconds for incremental sync.

        Returns:
            List of worklist entry dicts ready for database insertion.

        Raises:
            PluginFetchError: If the REDCap export or transformation fails.
        """
        try:
            # Extract REDCap field names from mapping
            redcap_fields = list(field_mapping.keys())

            # Ensure required REDCap fields are included
            default_fields = [
                "record_id", "study_id", "redcap_repeat_instrument",
                "mri_instance", "mri_date", "mri_time", "family_id",
                "youth_dob_y", "demo_sex"
            ]
            for field in default_fields:
                if field not in redcap_fields:
                    redcap_fields.append(field)

            # Fetch from REDCap API
            raw_records = self._fetch_redcap_entries(redcap_fields, interval)

            # Transform to WorklistItem format
            entries = self._transform_records(raw_records, field_mapping)

            self.logger.info(f"Fetched {len(entries)} worklist entries from REDCap")
            return entries

        except PluginFetchError:
            # Already the expected exception type; re-raise as-is instead
            # of wrapping a PluginFetchError inside another one.
            raise
        except Exception as e:
            raise PluginFetchError(f"Failed to fetch REDCap data: {e}") from e

    def _fetch_redcap_entries(self, redcap_fields: List[str], interval: float) -> List[Dict]:
        """
        Fetch REDCap entries using PyCap with memory-efficient processing.

        MEMORY OPTIMIZATION: Uses format_type="json" instead of "df" to avoid
        creating large pandas DataFrames (50-100x memory reduction).

        Args:
            redcap_fields: Candidate REDCap field names; silently narrowed
                to fields that actually exist in the project metadata.
            interval: Look-back window in seconds used to compute the
                date_begin/date_end export range.

        Returns:
            Filtered list of merged MRI record dicts (may be empty).
        """
        project = Project(self._api_url, self._api_token)

        try:
            # Fetch metadata to get valid REDCap field names
            valid_fields = {field["field_name"] for field in project.export_metadata()}
            redcap_fields = [field for field in redcap_fields if field in valid_fields]

            if not redcap_fields:
                self.logger.error("No valid REDCap fields found in provided mapping")
                return []

            self.logger.info(f"Fetching REDCap data for fields: {redcap_fields}")

            # Calculate date range for incremental sync
            datetime_now = datetime.now()
            datetime_interval = datetime_now - timedelta(seconds=interval)

            # Export data as JSON (list of dicts) instead of DataFrame
            records = project.export_records(
                fields=redcap_fields,
                date_begin=datetime_interval,
                date_end=datetime_now,
                format_type="json"
            )

        finally:
            # Clean up PyCap Project immediately after export
            del project
            gc.collect()

        if not records:
            self.logger.warning("No records retrieved from REDCap")
            return []

        self.logger.info(f"Retrieved {len(records)} raw records from REDCap")

        # Filter for valid MRI records
        filtered_records = self._filter_mri_records(records, redcap_fields)

        # Clean up intermediate data
        del records
        gc.collect()

        return filtered_records

    def _filter_mri_records(
        self,
        records: List[Dict],
        redcap_fields: List[str]
    ) -> List[Dict]:
        """
        Filter and group REDCap records to extract valid MRI entries.

        Groups rows by record_id, then merges each repeated 'mri' instrument
        row with that record's baseline (non-repeated) row: MRI-row values
        win, baseline fills the gaps, missing fields become None.

        Args:
            records: Raw REDCap export rows.
            redcap_fields: Fields to carry into each merged record.

        Returns:
            One merged dict per valid MRI instrument row.
        """
        # Group records by record_id using native Python
        records_by_id = {}
        for record in records:
            records_by_id.setdefault(record.get('record_id'), []).append(record)

        filtered_records = []

        # Process each record_id group
        for record_id, group in records_by_id.items():
            # Find baseline (non-repeated instrument) values
            baseline_record = next(
                (rec for rec in group if not rec.get('redcap_repeat_instrument')),
                {}
            )

            # Filter for valid MRI rows only: repeated 'mri' instrument with
            # instance, date, and time all present (empty string is falsy,
            # so a separate != '' check is unnecessary).
            mri_rows = [
                rec for rec in group
                if rec.get('redcap_repeat_instrument') == 'mri'
                and rec.get('mri_instance')
                and rec.get('mri_date')
                and rec.get('mri_time')
            ]

            for mri_row in mri_rows:
                record = {"record_id": record_id}

                # Merge fields from baseline and mri_row
                for field in redcap_fields:
                    # Use MRI row value if present, otherwise baseline
                    if field in mri_row and mri_row[field] not in (None, '', 'NaN'):
                        record[field] = mri_row[field]
                    elif field in baseline_record:
                        record[field] = baseline_record[field]
                    else:
                        record[field] = None

                filtered_records.append(record)

        # Clean up intermediate data
        del records_by_id
        gc.collect()

        self.logger.info(f"Filtered to {len(filtered_records)} MRI records")
        return filtered_records

    def _transform_records(
        self,
        raw_records: List[Dict],
        field_mapping: Dict[str, str]
    ) -> List[Dict]:
        """
        Transform REDCap records to WorklistItem format.

        Applies field mapping and constructs DICOM-compliant identifiers.
        Records without a study_id are skipped with a warning.

        Args:
            raw_records: Merged MRI record dicts from _filter_mri_records().
            field_mapping: Maps REDCap field names to worklist entry keys.

        Returns:
            List of worklist entry dicts.
        """
        entries = []

        for record in raw_records:
            # Extract core identifiers; IDs may be prefixed (e.g. "site-123"),
            # keep only the segment after the last hyphen.
            study_id = record.get("study_id", "")
            if study_id:
                study_id = study_id.split('-')[-1]

            family_id = record.get("family_id", "")
            if family_id:
                family_id = family_id.split('-')[-1]

            ses_id = record.get("mri_instance", "")

            # Skip if missing required identifiers
            if not study_id:
                self.logger.warning("Skipping record due to missing study_id")
                continue

            # Construct DICOM identifiers
            patient_name = f"cpip-id-{study_id}^fa-{family_id}"
            patient_id = f"sub_{study_id}_ses_{ses_id}_fam_{family_id}"

            # Build entry with mapped fields
            entry = {
                "patient_name": patient_name,
                "patient_id": patient_id,
                "modality": "MR",  # Default modality
                "study_instance_uid": self._generate_instance_uid(),
                "performed_procedure_step_status": "SCHEDULED",
                "data_source": self.get_source_name(),  # Track which data source created this entry
            }

            # Apply field mapping
            for source_field, target_field in field_mapping.items():
                if source_field in record and record[source_field] not in (None, '', 'NaN'):
                    entry[target_field] = record[source_field]

            # Ensure scheduled_start_date/time are populated for generic insertion
            if "scheduled_start_date" not in entry:
                entry["scheduled_start_date"] = record.get("mri_date") or record.get("scheduled_date")
            if "scheduled_start_time" not in entry:
                entry["scheduled_start_time"] = record.get("mri_time") or record.get("scheduled_time")

            entry["scheduled_start_date"] = self._normalize_legacy_date(entry.get("scheduled_start_date"))
            entry["scheduled_start_time"] = self._normalize_legacy_time(entry.get("scheduled_start_time"))

            # Apply protocol name when available
            if "protocol_name" not in entry and self._protocol is not None:
                if isinstance(self._protocol, str):
                    entry["protocol_name"] = self._protocol
                elif self._site_id and isinstance(self._protocol, dict):
                    entry["protocol_name"] = self._protocol.get(self._site_id)

            entries.append(entry)

        return entries

    def _normalize_legacy_date(self, value) -> str | None:
        """Normalize date values to legacy YYYY-MM-DD.

        Accepts YYYY[-/.]MM[-/.]DD, compact YYYYMMDD, or numeric input;
        unrecognized formats are logged at debug level and returned as-is.
        """
        if value is None:
            return None

        if isinstance(value, (int, float)):
            value = str(int(value))

        value = str(value).strip()
        if not value:
            return None

        match = re.match(r"^(\d{4})[-/.](\d{2})[-/.](\d{2})$", value)
        if match:
            return f"{match.group(1)}-{match.group(2)}-{match.group(3)}"

        if len(value) == 8 and value.isdigit():
            return f"{value[0:4]}-{value[4:6]}-{value[6:8]}"

        try:
            return datetime.strptime(value, "%Y-%m-%d").strftime("%Y-%m-%d")
        except Exception:
            self.logger.debug(f"Unrecognized date format: {value}")
            return value

    def _normalize_legacy_time(self, value) -> str | None:
        """Normalize time values to legacy HH:MM.

        Accepts HH:MM[:SS], compact HHMMSS/HHMM/HH, or numeric input;
        unrecognized formats are logged at debug level and returned as-is.
        """
        if value is None:
            return None

        if isinstance(value, (int, float)):
            value = str(int(value))

        value = str(value).strip()
        if not value:
            return None

        match = re.match(r"^(\d{2}):(\d{2})(?::(\d{2}))?$", value)
        if match:
            hh, mm, _ss = match.groups()
            return f"{hh}:{mm}"

        if len(value) == 6 and value.isdigit():
            return f"{value[0:2]}:{value[2:4]}"

        if len(value) == 4 and value.isdigit():
            return f"{value[0:2]}:{value[2:4]}"

        if len(value) == 2 and value.isdigit():
            return f"{value}:00"

        self.logger.debug(f"Unrecognized time format: {value}")
        return value

    def _generate_instance_uid(self) -> str:
        """Generate a valid Study Instance UID.

        Uses the UUID-derived UID form from DICOM PS3.5 Annex B
        ("2.25." + decimal UUID), which is globally unique and at most
        44 characters. The previous implementation minted UIDs under
        1.2.840.10008, a root reserved for the DICOM standard itself and
        not permitted for locally generated UIDs.
        """
        return f"2.25.{uuid.uuid4().int}"

    def get_source_name(self) -> str:
        """Return source type identifier."""
        return "REDCap"

    def supports_incremental_sync(self) -> bool:
        """REDCap supports incremental sync via date filtering."""
        return True

    def cleanup(self) -> None:
        """Perform memory cleanup after sync."""
        # Force garbage collection
        gc.collect()
        self.logger.debug("REDCap plugin cleanup complete")
pylantir/db_setup.py CHANGED
@@ -72,8 +72,11 @@ def get_threadsafe_engine(db_path="worklist.db", echo=False):
72
72
 
73
73
  # Load environment variables (you can use dotenv for more flexibility)
74
74
  DB_PATH = os.getenv("DB_PATH", "worklist.db") # Default: current directory
75
+ DB_PATH = os.path.expanduser(DB_PATH)
75
76
  DB_ECHO = os.getenv("DB_ECHO", "False").lower() in ("true", "1")
76
77
 
78
+ lgr.info(f"Using worklist database path: {DB_PATH}")
79
+
77
80
  # Create the engine
78
81
  engine = get_engine(db_path=DB_PATH, echo=DB_ECHO)
79
82
 
pylantir/models.py CHANGED
@@ -43,6 +43,9 @@ class WorklistItem(Base):
43
43
  lgr.warning("Could not get hisris_coding_designator check models.py ")
44
44
  performed_procedure_step_status = Column(String, default="SCHEDULED")
45
45
 
46
+ # Data source tracking (for multi-source architecture)
47
+ data_source = Column(String(255), nullable=True, default=None)
48
+
46
49
 
47
50
  def __repr__(self):
48
51
  return (f"<WorklistItem(id={self.id}, study_instance_uid={self.study_instance_uid}, patient_name={self.patient_name}, "
pylantir/populate_db.py CHANGED
@@ -21,7 +21,8 @@ def populate_data():
21
21
  station_name="MRI_ROOM_1",
22
22
  protocol_name="BRAIN_MRI_3T",
23
23
  study_instance_uid="1.2.3.4.5.6.7.8.1",
24
- study_description="MRI BRAIN" # CPIP
24
+ study_description="MRI BRAIN", # CPIP
25
+ data_source="test_data"
25
26
  )
26
27
  session.add(item1)
27
28
 
@@ -41,7 +42,8 @@ def populate_data():
41
42
  station_name="MRI_ROOM_1",
42
43
  protocol_name="BRAIN_MRI_3T",
43
44
  study_instance_uid="1.2.3.4.5.6.7.8.2",
44
- study_description="MRI BRAIN" # CPIP
45
+ study_description="MRI BRAIN", # CPIP
46
+ data_source="test_data"
45
47
  )
46
48
  session.add(item2)
47
49
 
@@ -60,7 +62,8 @@ def populate_data():
60
62
  station_name="MRI_ROOM_1",
61
63
  protocol_name="BRAIN_MRI_3T",
62
64
  study_instance_uid="1.2.3.4.5.6.7.8.3",
63
- study_description="MRI BRAIN" # CPIP
65
+ study_description="MRI BRAIN", # CPIP
66
+ data_source="test_data"
64
67
  )
65
68
  session.add(item3)
66
69