pylantir 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylantir/api_server.py +13 -9
- pylantir/cli/run.py +307 -41
- pylantir/config/calpendo_config_example.json +65 -0
- pylantir/config/mwl_config.json +3 -1
- pylantir/data_sources/__init__.py +84 -0
- pylantir/data_sources/base.py +117 -0
- pylantir/data_sources/calpendo_plugin.py +702 -0
- pylantir/data_sources/redcap_plugin.py +367 -0
- pylantir/db_setup.py +3 -0
- pylantir/models.py +3 -0
- pylantir/populate_db.py +6 -3
- pylantir/redcap_to_db.py +128 -81
- {pylantir-0.2.3.dist-info → pylantir-0.3.1.dist-info}/METADATA +316 -33
- pylantir-0.3.1.dist-info/RECORD +25 -0
- pylantir-0.2.3.dist-info/RECORD +0 -20
- {pylantir-0.2.3.dist-info → pylantir-0.3.1.dist-info}/WHEEL +0 -0
- {pylantir-0.2.3.dist-info → pylantir-0.3.1.dist-info}/entry_points.txt +0 -0
- {pylantir-0.2.3.dist-info → pylantir-0.3.1.dist-info}/licenses/LICENSE +0 -0
pylantir/data_sources/redcap_plugin.py
ADDED

@@ -0,0 +1,367 @@
+"""
+REDCap Data Source Plugin
+
+This module implements the DataSourcePlugin interface for REDCap API integration.
+Extracts and transforms REDCap data into DICOM worklist entries.
+
+Version: 1.0.0
+"""
+
+import os
+import logging
+import re
+from redcap.project import Project # type: ignore
+import uuid
+from datetime import datetime, timedelta
+import gc
+from typing import Dict, List, Tuple
+
+from .base import DataSourcePlugin, PluginFetchError, PluginConfigError
+
+lgr = logging.getLogger(__name__)
+
+
+class REDCapPlugin(DataSourcePlugin):
+    """
+    REDCap data source plugin.
+
+    Connects to REDCap API and fetches worklist entries with memory-efficient
+    processing (avoiding pandas DataFrames).
+
+    Configuration Requirements:
+    - site_id: Site identifier
+    - protocol: Protocol mapping dictionary
+
+    Environment Variables:
+    - REDCAP_API_URL: REDCap API endpoint
+    - REDCAP_API_TOKEN: REDCap API access token
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._api_url = None
+        self._api_token = None
+        self._site_id = None
+        self._protocol = None
+
+    def validate_config(self, config: Dict) -> Tuple[bool, str]:
+        """Validate REDCap plugin configuration."""
+        # Check required config keys
+        if "site_id" not in config:
+            return (False, "Missing required configuration key: site_id")
+
+        if "protocol" not in config:
+            return (False, "Missing required configuration key: protocol")
+
+        # Protocol can be either a string (protocol name) or dict (legacy format)
+        if not isinstance(config["protocol"], (str, dict)):
+            return (False, "protocol must be a string or dictionary")
+
+        # Check environment variables
+        self._api_url = os.getenv("REDCAP_API_URL")
+        self._api_token = os.getenv("REDCAP_API_TOKEN")
+
+        if not self._api_url:
+            return (False, "REDCAP_API_URL environment variable not set")
+
+        if not self._api_token:
+            return (False, "REDCAP_API_TOKEN environment variable not set")
+
+        self._site_id = config.get("site_id")
+        self._protocol = config.get("protocol")
+
+        self.logger.info(f"REDCap plugin validated for site {config['site_id']}")
+        return (True, "")
+
+    def fetch_entries(
+        self,
+        field_mapping: Dict[str, str],
+        interval: float
+    ) -> List[Dict]:
+        """
+        Fetch worklist entries from REDCap.
+
+        Uses incremental sync based on interval parameter to fetch only
+        recently modified records.
+        """
+        try:
+            # Extract REDCap field names from mapping
+            redcap_fields = list(field_mapping.keys())
+
+            # Ensure required REDCap fields are included
+            default_fields = [
+                "record_id", "study_id", "redcap_repeat_instrument",
+                "mri_instance", "mri_date", "mri_time", "family_id",
+                "youth_dob_y", "demo_sex"
+            ]
+            for field in default_fields:
+                if field not in redcap_fields:
+                    redcap_fields.append(field)
+
+            # Fetch from REDCap API
+            raw_records = self._fetch_redcap_entries(redcap_fields, interval)
+
+            # Transform to WorklistItem format
+            entries = self._transform_records(raw_records, field_mapping)
+
+            self.logger.info(f"Fetched {len(entries)} worklist entries from REDCap")
+            return entries
+
+        except Exception as e:
+            raise PluginFetchError(f"Failed to fetch REDCap data: {e}") from e
+
+    def _fetch_redcap_entries(self, redcap_fields: List[str], interval: float) -> List[Dict]:
+        """
+        Fetch REDCap entries using PyCap with memory-efficient processing.
+
+        MEMORY OPTIMIZATION: Uses format_type="json" instead of "df" to avoid
+        creating large pandas DataFrames (50-100x memory reduction).
+        """
+        project = Project(self._api_url, self._api_token)
+
+        try:
+            # Fetch metadata to get valid REDCap field names
+            valid_fields = {field["field_name"] for field in project.export_metadata()}
+            redcap_fields = [field for field in redcap_fields if field in valid_fields]
+
+            if not redcap_fields:
+                self.logger.error("No valid REDCap fields found in provided mapping")
+                return []
+
+            self.logger.info(f"Fetching REDCap data for fields: {redcap_fields}")
+
+            # Calculate date range for incremental sync
+            datetime_now = datetime.now()
+            datetime_interval = datetime_now - timedelta(seconds=interval)
+
+            # Export data as JSON (list of dicts) instead of DataFrame
+            records = project.export_records(
+                fields=redcap_fields,
+                date_begin=datetime_interval,
+                date_end=datetime_now,
+                format_type="json"
+            )
+
+        finally:
+            # Clean up PyCap Project immediately after export
+            del project
+            gc.collect()
+
+        if not records:
+            self.logger.warning("No records retrieved from REDCap")
+            return []
+
+        self.logger.info(f"Retrieved {len(records)} raw records from REDCap")
+
+        # Filter for valid MRI records
+        filtered_records = self._filter_mri_records(records, redcap_fields)
+
+        # Clean up intermediate data
+        del records
+        gc.collect()
+
+        return filtered_records
+
+    def _filter_mri_records(
+        self,
+        records: List[Dict],
+        redcap_fields: List[str]
+    ) -> List[Dict]:
+        """
+        Filter and group REDCap records to extract valid MRI entries.
+
+        Groups by record_id and merges baseline + MRI instrument data.
+        """
+        # Group records by record_id using native Python
+        records_by_id = {}
+        for record in records:
+            record_id = record.get('record_id')
+            if record_id not in records_by_id:
+                records_by_id[record_id] = []
+            records_by_id[record_id].append(record)
+
+        filtered_records = []
+
+        # Process each record_id group
+        for record_id, group in records_by_id.items():
+            # Find baseline (non-repeated instrument) values
+            baseline_record = None
+            for rec in group:
+                if not rec.get('redcap_repeat_instrument'):
+                    baseline_record = rec
+                    break
+
+            if baseline_record is None:
+                baseline_record = {}
+
+            # Filter for valid MRI rows only
+            mri_rows = [
+                rec for rec in group
+                if rec.get('redcap_repeat_instrument') == 'mri'
+                and rec.get('mri_instance')
+                and rec.get('mri_instance') != ''
+                and rec.get('mri_date')
+                and rec.get('mri_time')
+            ]
+
+            for mri_row in mri_rows:
+                record = {"record_id": record_id}
+
+                # Merge fields from baseline and mri_row
+                for field in redcap_fields:
+                    # Use MRI row value if present, otherwise baseline
+                    if field in mri_row and mri_row[field] not in (None, '', 'NaN'):
+                        record[field] = mri_row[field]
+                    elif field in baseline_record:
+                        record[field] = baseline_record[field]
+                    else:
+                        record[field] = None
+
+                filtered_records.append(record)
+
+        # Clean up intermediate data
+        del records_by_id
+        gc.collect()
+
+        self.logger.info(f"Filtered to {len(filtered_records)} MRI records")
+        return filtered_records
+
+    def _transform_records(
+        self,
+        raw_records: List[Dict],
+        field_mapping: Dict[str, str]
+    ) -> List[Dict]:
+        """
+        Transform REDCap records to WorklistItem format.
+
+        Applies field mapping and constructs DICOM-compliant identifiers.
+        """
+        entries = []
+
+        for record in raw_records:
+            # Extract core identifiers
+            study_id = record.get("study_id", "")
+            if study_id:
+                study_id = study_id.split('-')[-1]
+
+            family_id = record.get("family_id", "")
+            if family_id:
+                family_id = family_id.split('-')[-1]
+
+            ses_id = record.get("mri_instance", "")
+
+            # Skip if missing required identifiers
+            if not study_id:
+                self.logger.warning("Skipping record due to missing study_id")
+                continue
+
+            # Construct DICOM identifiers
+            patient_name = f"cpip-id-{study_id}^fa-{family_id}"
+            patient_id = f"sub_{study_id}_ses_{ses_id}_fam_{family_id}"
+
+            # Build entry with mapped fields
+            entry = {
+                "patient_name": patient_name,
+                "patient_id": patient_id,
+                "modality": "MR", # Default modality
+                "study_instance_uid": self._generate_instance_uid(),
+                "performed_procedure_step_status": "SCHEDULED",
+                "data_source": self.get_source_name(), # Track which data source created this entry
+            }
+
+            # Apply field mapping
+            for source_field, target_field in field_mapping.items():
+                if source_field in record and record[source_field] not in (None, '', 'NaN'):
+                    entry[target_field] = record[source_field]
+
+            # Ensure scheduled_start_date/time are populated for generic insertion
+            if "scheduled_start_date" not in entry:
+                entry["scheduled_start_date"] = record.get("mri_date") or record.get("scheduled_date")
+            if "scheduled_start_time" not in entry:
+                entry["scheduled_start_time"] = record.get("mri_time") or record.get("scheduled_time")
+
+            entry["scheduled_start_date"] = self._normalize_legacy_date(entry.get("scheduled_start_date"))
+            entry["scheduled_start_time"] = self._normalize_legacy_time(entry.get("scheduled_start_time"))
+
+            # Apply protocol name when available
+            if "protocol_name" not in entry and self._protocol is not None:
+                if isinstance(self._protocol, str):
+                    entry["protocol_name"] = self._protocol
+                elif self._site_id and isinstance(self._protocol, dict):
+                    entry["protocol_name"] = self._protocol.get(self._site_id)
+
+            entries.append(entry)
+
+        return entries
+
+    def _normalize_legacy_date(self, value) -> str | None:
+        """Normalize date values to legacy YYYY-MM-DD."""
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            value = str(int(value))
+
+        value = str(value).strip()
+        if not value:
+            return None
+
+        match = re.match(r"^(\d{4})[-/.](\d{2})[-/.](\d{2})$", value)
+        if match:
+            return f"{match.group(1)}-{match.group(2)}-{match.group(3)}"
+
+        if len(value) == 8 and value.isdigit():
+            return f"{value[0:4]}-{value[4:6]}-{value[6:8]}"
+
+        try:
+            return datetime.strptime(value, "%Y-%m-%d").strftime("%Y-%m-%d")
+        except Exception:
+            self.logger.debug(f"Unrecognized date format: {value}")
+            return value
+
+    def _normalize_legacy_time(self, value) -> str | None:
+        """Normalize time values to legacy HH:MM."""
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            value = str(int(value))
+
+        value = str(value).strip()
+        if not value:
+            return None
+
+        match = re.match(r"^(\d{2}):(\d{2})(?::(\d{2}))?$", value)
+        if match:
+            hh, mm, _ss = match.groups()
+            return f"{hh}:{mm}"
+
+        if len(value) == 6 and value.isdigit():
+            return f"{value[0:2]}:{value[2:4]}"
+
+        if len(value) == 4 and value.isdigit():
+            return f"{value[0:2]}:{value[2:4]}"
+
+        if len(value) == 2 and value.isdigit():
+            return f"{value}:00"
+
+        self.logger.debug(f"Unrecognized time format: {value}")
+        return value
+
+    def _generate_instance_uid(self) -> str:
+        """Generate a valid Study Instance UID."""
+        return f"1.2.840.10008.3.1.2.3.4.{uuid.uuid4().int}"
+
+    def get_source_name(self) -> str:
+        """Return source type identifier."""
+        return "REDCap"
+
+    def supports_incremental_sync(self) -> bool:
+        """REDCap supports incremental sync via date filtering."""
+        return True
+
+    def cleanup(self) -> None:
+        """Perform memory cleanup after sync."""
+        # Force garbage collection
+        gc.collect()
+        self.logger.debug("REDCap plugin cleanup complete")
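The plugin's public surface is small: validate_config() checks the config dict and the two REDCAP_* environment variables, fetch_entries() pulls, filters, and maps records, and cleanup() releases memory. A minimal driver sketch follows, assuming the DataSourcePlugin base class wires up self.logger; the URL, token, site_id, and field_mapping values are illustrative placeholders, not values from the package:

import os
from pylantir.data_sources.redcap_plugin import REDCapPlugin

# Placeholder credentials -- real values come from the deployment environment.
os.environ["REDCAP_API_URL"] = "https://redcap.example.org/api/"
os.environ["REDCAP_API_TOKEN"] = "0123456789ABCDEF0123456789ABCDEF"

plugin = REDCapPlugin()
ok, reason = plugin.validate_config({"site_id": "site01", "protocol": "BRAIN_MRI_3T"})
if not ok:
    raise RuntimeError(f"REDCap plugin config rejected: {reason}")

# Map REDCap field names (keys) to worklist fields (values);
# interval=3600.0 syncs records modified in the last hour.
entries = plugin.fetch_entries(
    field_mapping={
        "mri_date": "scheduled_start_date",
        "mri_time": "scheduled_start_time",
        "demo_sex": "patient_sex",  # hypothetical target field
    },
    interval=3600.0,
)
plugin.cleanup()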
pylantir/db_setup.py
CHANGED
@@ -72,8 +72,11 @@ def get_threadsafe_engine(db_path="worklist.db", echo=False):
 
 # Load environment variables (you can use dotenv for more flexibility)
 DB_PATH = os.getenv("DB_PATH", "worklist.db") # Default: current directory
+DB_PATH = os.path.expanduser(DB_PATH)
 DB_ECHO = os.getenv("DB_ECHO", "False").lower() in ("true", "1")
 
+lgr.info(f"Using worklist database path: {DB_PATH}")
+
 # Create the engine
 engine = get_engine(db_path=DB_PATH, echo=DB_ECHO)
 
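The expanduser() call matters when DB_PATH is configured with a home-relative path. A quick sketch of the behavior change (path and home directory are examples, not package defaults):

import os

os.environ["DB_PATH"] = "~/pylantir/worklist.db"

raw = os.getenv("DB_PATH", "worklist.db")
# Without expansion, the engine would treat "~" as a literal directory name
# relative to the working directory; with it, the path resolves under $HOME,
# e.g. "/home/alice/pylantir/worklist.db".
print(os.path.expanduser(raw))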
pylantir/models.py
CHANGED
@@ -43,6 +43,9 @@ class WorklistItem(Base):
         lgr.warning("Could not get hisris_coding_designator check models.py ")
     performed_procedure_step_status = Column(String, default="SCHEDULED")
 
+    # Data source tracking (for multi-source architecture)
+    data_source = Column(String(255), nullable=True, default=None)
+
 
     def __repr__(self):
         return (f"<WorklistItem(id={self.id}, study_instance_uid={self.study_instance_uid}, patient_name={self.patient_name}, "
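Because data_source is nullable with a NULL default, rows written by pylantir 0.2.x remain valid after the upgrade; only new entries carry a source label. A sketch of filtering by the new column, assuming the module-level engine from db_setup shown above (treat the exact import paths as assumptions):

from sqlalchemy.orm import Session

from pylantir.db_setup import engine
from pylantir.models import WorklistItem

with Session(engine) as session:
    # Entries created by the REDCap plugin are tagged "REDCap" via get_source_name().
    redcap_items = (
        session.query(WorklistItem)
        .filter(WorklistItem.data_source == "REDCap")
        .all()
    )
    # Pre-0.3.x rows have data_source IS NULL, so they match no source filter.
    legacy_items = (
        session.query(WorklistItem)
        .filter(WorklistItem.data_source.is_(None))
        .all()
    )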
pylantir/populate_db.py
CHANGED
@@ -21,7 +21,8 @@ def populate_data():
         station_name="MRI_ROOM_1",
         protocol_name="BRAIN_MRI_3T",
         study_instance_uid="1.2.3.4.5.6.7.8.1",
-        study_description="MRI BRAIN" # CPIP
+        study_description="MRI BRAIN", # CPIP
+        data_source="test_data"
     )
     session.add(item1)
 
@@ -41,7 +42,8 @@ def populate_data():
         station_name="MRI_ROOM_1",
         protocol_name="BRAIN_MRI_3T",
         study_instance_uid="1.2.3.4.5.6.7.8.2",
-        study_description="MRI BRAIN" # CPIP
+        study_description="MRI BRAIN", # CPIP
+        data_source="test_data"
     )
     session.add(item2)
 
@@ -60,7 +62,8 @@ def populate_data():
         station_name="MRI_ROOM_1",
         protocol_name="BRAIN_MRI_3T",
         study_instance_uid="1.2.3.4.5.6.7.8.3",
-        study_description="MRI BRAIN" # CPIP
+        study_description="MRI BRAIN", # CPIP
+        data_source="test_data"
     )
     session.add(item3)
 