pylantir 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylantir/redcap_to_db.py CHANGED
@@ -1,6 +1,15 @@
+ """
+ LEGACY MODULE - Backward Compatibility Wrapper for REDCapPlugin
+
+ This module provides backward-compatible function signatures that internally
+ delegate to the new plugin-based architecture. Existing code can continue
+ calling these functions without modification.
+
+ MIGRATION PATH: New code should use src/pylantir/data_sources/redcap_plugin.py
+ directly instead of these legacy wrappers.
+ """
  import os
  import logging
- import pandas as pd
  from redcap import Project
  import uuid
  from sqlalchemy.orm import sessionmaker
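A minimal sketch of the compatibility contract the new module docstring describes; the field names and interval value are illustrative, while the import paths are the ones named in the diff:

```python
# Legacy code keeps working through the wrapper:
from pylantir.redcap_to_db import fetch_redcap_entries

entries = fetch_redcap_entries(["mri_date", "mri_time"], interval=60.0)

# New code targets the plugin module directly, per the MIGRATION PATH note:
from pylantir.data_sources.redcap_plugin import REDCapPlugin
```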
@@ -13,6 +22,13 @@ import gc
 
  lgr = logging.getLogger(__name__)
 
+ # Import the new plugin system
+ from .data_sources.redcap_plugin import REDCapPlugin
+ from .data_sources.base import PluginError
+
+ # NOTE: pandas import removed - we use native Python dicts/lists to avoid
+ # DataFrame memory overhead (50-100x memory reduction per sync cycle)
+
  # Optional memory monitoring (install with: pip install psutil)
  try:
      import psutil
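The pandas removal means each REDCap record now travels as a plain dict. A hypothetical shape of one returned entry, inferred from the fields this module reads later (the exact schema is defined by REDCapPlugin, which is outside this file):

```python
# Illustrative entry produced by fetch_redcap_entries() after the
# pandas removal; field names are taken from this diff:
entry = {
    "record_id": "001",
    "mri_date": "2025-01-15",
    "mri_time": "09:30",
    "modality": "MR",
}
# A list of such plain dicts replaces the per-sync DataFrame, avoiding the
# index and buffer overhead pandas carried for small result sets.
```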
@@ -33,80 +49,71 @@ Session = sessionmaker(bind=engine)
 
 
  def fetch_redcap_entries(redcap_fields: list, interval: float) -> list:
-     """Fetch REDCap entries using PyCap and return a list of filtered dicts."""
-     project = Project(REDCAP_API_URL, REDCAP_API_TOKEN)
+     """
+     LEGACY WRAPPER: Fetch REDCap entries using PyCap and return a list of filtered dicts.
 
-     if not redcap_fields:
-         lgr.error("No field mapping (redcap2wl) provided for REDCap retrieval.")
-         return []
+     **DEPRECATION NOTICE**: This function is deprecated and maintained only for
+     backward compatibility. New code should use REDCapPlugin.fetch_entries() directly
+     from src/pylantir/data_sources/redcap_plugin.py.
 
-     # Fetch metadata to get valid REDCap field names
-     valid_fields = {field["field_name"] for field in project.export_metadata()}
-     redcap_fields = [field for field in redcap_fields if field in valid_fields]
+     This function now delegates to REDCapPlugin for consistency with the new
+     plugin architecture. Existing callers can continue using this signature.
 
-     if not redcap_fields:
-         lgr.error("No valid REDCap fields found in provided mapping.")
-         return []
+     Args:
+         redcap_fields: List of REDCap field names to fetch
+         interval: Time window in seconds to fetch records from
 
-     lgr.info(f"Fetching REDCap data for fields: {redcap_fields}")
+     Returns:
+         List of filtered MRI record dictionaries
 
-     # Export data
-     datetime_now = datetime.now()
-     datetime_interval = datetime_now - timedelta(seconds=interval)
-     records = project.export_records(fields=redcap_fields, date_begin=datetime_interval, date_end=datetime_now, format_type="df")
+     MIGRATION PATH: Use REDCapPlugin directly:
+     ```python
+     from pylantir.data_sources.redcap_plugin import REDCapPlugin
+     plugin = REDCapPlugin()
+     entries = plugin.fetch_entries(field_mapping=field_mapping, interval=interval)
+     ```
+     """
+     lgr.warning(
+         "fetch_redcap_entries() is deprecated. "
+         "Use REDCapPlugin from src/pylantir/data_sources/redcap_plugin.py instead."
+     )
 
-     # Clean up PyCap Project immediately after export to free API client cache
-     del project
-     gc.collect()
+     try:
+         # Build a minimal plugin configuration for legacy calls
+         config = {
+             "site_id": "default",  # Legacy calls don't have a site_id
+             "protocol": "DEFAULT_PROTOCOL"
+         }
 
-     if records.empty:
-         lgr.warning("No records retrieved from REDCap.")
-         # Explicitly clean up the empty DataFrame to release any allocated buffers
-         del records
-         gc.collect()
-         return []
+         # Create field mapping for the plugin (maps each REDCap field name to itself)
+         field_mapping = {field: field for field in redcap_fields}
 
-     filtered_records = []
-
-     # Group by 'record_id' (index level 0)
-     # Convert to list to avoid holding groupby iterator reference
-     record_groups = list(records.groupby(level=0))
-     for record_id, group in record_groups:
-
-         # Try to get baseline (non-repeated instrument) values
-         baseline_rows = group[group['redcap_repeat_instrument'].isna()]
-         baseline_row = baseline_rows.iloc[0] if not baseline_rows.empty else {}
-
-         # Filter for valid MRI rows only
-         mri_rows = group[
-             (group["redcap_repeat_instrument"] == "mri") &
-             (group.get("mri_instance").notna()) &
-             (group.get("mri_instance") != "") &
-             (group.get("mri_date").notna()) &
-             (group.get("mri_time").notna())
-         ]
-
-         for _, mri_row in mri_rows.iterrows():
-             record = {"record_id": record_id}
-
-             # Merge fields from baseline and mri_row, only include requested fields
-             for field in redcap_fields:
-                 record[field] = (
-                     mri_row.get(field)
-                     if pd.notna(mri_row.get(field))
-                     else baseline_row.get(field)
-                 )
+         # Instantiate plugin (no constructor arguments)
+         plugin = REDCapPlugin()
+
+         # Validate configuration
+         is_valid, error_msg = plugin.validate_config(config)
+         if not is_valid:
+             lgr.error(f"Plugin configuration validation failed: {error_msg}")
+             return []
+
+         # Fetch entries using the plugin
+         datetime_now = datetime.now()
+         datetime_interval = datetime_now - timedelta(seconds=interval)
 
-             filtered_records.append(record)
+         entries = plugin.fetch_entries(field_mapping=field_mapping, interval=interval)
 
-     # Explicitly clean up DataFrame and groupby list to free memory
-     del record_groups
-     del records
-     gc.collect()
-
-     return filtered_records
+         # Clean up plugin resources
+         plugin.cleanup()
 
-     # TODO: Implement age binning for paricipants
+         return entries
+
+     except PluginError as e:
+         lgr.error(f"Plugin error in legacy fetch_redcap_entries: {e}")
+         return []
+     except Exception as e:
+         lgr.error(f"Unexpected error in legacy fetch_redcap_entries: {e}")
+         return []  # TODO: Implement age binning for participants
 
  def age_binning():
      return None
 
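The wrapper's delegation contract in miniature; the names come from the diff above, while REDCapPlugin's internals remain assumptions since they live outside this file:

```python
redcap_fields = ["mri_date", "mri_time", "mri_instance"]

# Identity mapping: the plugin returns the same field names the legacy
# caller requested.
field_mapping = {field: field for field in redcap_fields}
assert field_mapping["mri_date"] == "mri_date"

# On PluginError (or any unexpected exception) the wrapper logs the error
# and returns [], preserving the legacy function's failure mode.
```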
@@ -163,35 +170,35 @@ def cleanup_memory_and_connections():
      This function should be called after each synchronization cycle.
      """
      lgr.debug("Starting memory and connection cleanup...")
-
+
      # Get memory usage before cleanup
      memory_before = get_memory_usage()
-
+
      try:
          # 1. Clear pandas cache and temporary objects
          # Force garbage collection of pandas objects
          gc.collect()
-
+
          # 2. Close any idle database connections in the pool
          if hasattr(engine, 'pool'):
              # Dispose of the connection pool to free up connections
              lgr.debug("Disposing database connection pool")
              engine.pool.dispose()
-
+
          # 3. Force Python garbage collection targeting all generations
          # Target generation 2 (oldest) first to catch long-lived objects
          collected = gc.collect(generation=2)  # Oldest generation
          collected += gc.collect(generation=1)  # Middle generation
          collected += gc.collect(generation=0)  # Youngest generation
-
+
          # 4. Clear any cached SQLAlchemy metadata
          if hasattr(engine, 'pool'):
              # Recreate the pool with fresh connections
              engine.pool.recreate()
-
+
          # Get memory usage after cleanup
          memory_after = get_memory_usage()
-
+
          # Log cleanup results with simplified, focused metrics
          if memory_before and memory_after and 'rss_mb' in memory_before:
              freed = memory_before['rss_mb'] - memory_after['rss_mb']
@@ -203,7 +210,7 @@ def cleanup_memory_and_connections():
              )
          else:
              lgr.info(f"Memory cleanup: Collected {collected} objects")
-
+
      except Exception as e:
          lgr.error(f"Error during cleanup: {e}")
          # Don't let cleanup errors stop the main process
@@ -215,8 +222,29 @@ def sync_redcap_to_db(
      protocol: dict,
      redcap2wl: dict,
      interval: float = 60.0,
+     source_name: str = None,
  ) -> None:
-     """Sync REDCap patient data with the worklist database."""
+     """
+     LEGACY WRAPPER: Sync REDCap patient data with the worklist database.
+
+     **DEPRECATION NOTICE**: This function is deprecated and maintained only for
+     backward compatibility. New code should use the plugin-based architecture
+     from src/pylantir/data_sources/.
+
+     NOTE: This function now uses REDCapPlugin internally via the
+     fetch_redcap_entries() wrapper, ensuring consistent behavior with the new
+     plugin architecture.
+
+     Args:
+         site_id: Site identifier
+         protocol: Protocol mapping dictionary
+         redcap2wl: Field mapping dictionary
+         interval: Sync interval in seconds
+         source_name: Optional data source name for tracking (new in v0.3.0)
+     """
+     lgr.warning(
+         "sync_redcap_to_db() is deprecated. "
+         "Use data_sources configuration with REDCapPlugin instead."
+     )
 
      if not redcap2wl:
          lgr.error("No field mapping (redcap2wl) provided for syncing.")
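A hypothetical legacy call using the new optional `source_name` parameter; the `protocol` and `redcap2wl` values here are illustrative, not an authoritative schema:

```python
sync_redcap_to_db(
    site_id="site_01",
    protocol={"protocol_name": "brain_mri"},
    redcap2wl={"mri_date": "scheduled_start_date"},
    interval=60.0,
    source_name="redcap_main",  # tags worklist rows with their origin
)
# Omitting source_name keeps pre-0.3.0 behavior: existing rows are left
# untouched and new rows get data_source=None.
```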
@@ -241,6 +269,7 @@ def sync_redcap_to_db(
          if i not in redcap_fields:
              redcap_fields.append(i)
 
+     # NOTE: fetch_redcap_entries() now delegates to REDCapPlugin internally
      redcap_entries = fetch_redcap_entries(redcap_fields, interval)
 
      for record in redcap_entries:
@@ -290,6 +319,9 @@ def sync_redcap_to_db(
              existing_entry.modality = record.get("modality", "MR")
              existing_entry.scheduled_start_date = record.get("mri_date")
              existing_entry.scheduled_start_time = record.get("mri_time")
+             # Track data source if provided
+             if source_name:
+                 existing_entry.data_source = source_name
              # Dynamically update DICOM worklist fields from REDCap
              for redcap_field, dicom_field in redcap2wl.items():
                  if redcap_field in record:
@@ -324,13 +356,14 @@ def sync_redcap_to_db(
                  # performing_physician=record.get("performing_physician"),
                  study_description=record.get("study_description", "CPIP"),
                  # station_name=record.get("station_name"),
-                 performed_procedure_step_status="SCHEDULED"
+                 performed_procedure_step_status="SCHEDULED",
+                 data_source=source_name  # Track which source created this entry
              )
              session.add(new_entry)
 
          session.commit()
          logging.info("REDCap data synchronized successfully with DICOM worklist database.")
-
+
      except Exception as e:
          lgr.error(f"Error during REDCap synchronization: {e}")
          if session:
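Both `data_source` assignments above presuppose a `data_source` column on the worklist model, which lives outside this diff. A minimal sketch of the assumed schema addition, with illustrative names:

```python
from sqlalchemy import Column, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class WorklistEntry(Base):  # hypothetical name, not from this diff
    __tablename__ = "worklist"
    record_id = Column(String, primary_key=True)
    data_source = Column(String, nullable=True)  # None for pre-0.3.0 rows
```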
@@ -342,10 +375,10 @@ def sync_redcap_to_db(
          # Detach all ORM objects from session to clear identity map
          session.expunge_all()
          session.close()
-
+
          # Perform cleanup after sync
          cleanup_memory_and_connections()
-
+
          # Log memory usage after cleanup
          memory_after = get_memory_usage()
          if memory_after:
@@ -358,12 +391,22 @@ def sync_redcap_to_db_repeatedly(
      redcap2wl=None,
      interval=60,
      operation_interval={"start_time": [00,00], "end_time": [23,59]},
+     source_name=None,
  ):
      """
-     Keep syncing with REDCap in a loop every `interval` seconds,
-     but only between operation_interval[start_time] and operation_interval[end_time].
-     Exit cleanly when STOP_EVENT is set.
+     LEGACY WRAPPER: Keep syncing with REDCap in a loop every `interval` seconds.
+
+     **DEPRECATION NOTICE**: This function is deprecated and maintained only for
+     backward compatibility. New code should use the plugin-based multi-source
+     orchestration from src/pylantir/cli/run.py.
+
+     MIGRATION PATH: Configure the data_sources array in mwl_config.json and use
+     the new orchestration system.
      """
+     lgr.warning(
+         "sync_redcap_to_db_repeatedly() is deprecated. "
+         "Use data_sources configuration with multi-source orchestration instead."
+     )
      if operation_interval is None:
          operation_interval = {"start_time": [0, 0], "end_time": [23, 59]}
 
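The migration path names a data_sources array in mwl_config.json, but its schema is not shown in this diff. A guess at a plausible configuration, written as the dict that parsing the JSON file would return; the real schema is defined by pylantir's orchestration code and may differ:

```python
config = {
    "data_sources": [
        {
            "type": "redcap",
            "name": "redcap_main",   # becomes source_name on worklist rows
            "site_id": "site_01",
            "protocol": "DEFAULT_PROTOCOL",
            "interval": 60,
        }
    ]
}
```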
@@ -399,6 +442,8 @@ def sync_redcap_to_db_repeatedly(
              dt_end_yesterday = datetime.combine(yesterday, end_time)
              dt_start_today = datetime.combine(today_date, start_time)
              delta = dt_start_today - dt_end_yesterday
+             # Temporary large interval to catch up on missed data:
+             # delta = delta + timedelta(seconds=6000000)
              # guaranteed to be positive if yesterday < today
              extended_interval = delta.total_seconds()
              logging.info(f"Using extended interval: {extended_interval}, {interval} seconds until next sync.")
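A quick worked example of the extended-interval computation above, with illustrative dates and an operation window that closed at 23:59 yesterday and reopens at 06:00 today:

```python
from datetime import date, datetime, time

dt_end_yesterday = datetime.combine(date(2025, 1, 14), time(23, 59))
dt_start_today = datetime.combine(date(2025, 1, 15), time(6, 0))

delta = dt_start_today - dt_end_yesterday
print(delta.total_seconds())  # 21660.0 -> the day's first sync covers the 6h01m gap
```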
@@ -417,6 +462,7 @@ def sync_redcap_to_db_repeatedly(
                      protocol=protocol,
                      redcap2wl=redcap2wl,
                      interval=extended_interval,
+                     source_name=source_name,
                  )
              else:
                  sync_redcap_to_db(
@@ -424,6 +470,7 @@ def sync_redcap_to_db_repeatedly(
                      protocol=protocol,
                      redcap2wl=redcap2wl,
                      interval=interval_sync,
+                     source_name=source_name,
                  )
              last_sync_date = today_date
              logging.debug(f"REDCap sync completed at {now_time}. Next sync attempt in {interval} seconds.")
@@ -440,7 +487,7 @@ def sync_redcap_to_db_repeatedly(
                  f"Current time {now_time} is outside operation window "
                  f"({start_time}–{end_time}). Sleeping for {interval} seconds."
              )
-
+
              # Run periodic cleanup even during off-hours to prevent memory buildup
              # Run once per day to avoid excessive overhead
              if (now_dt.hour == 3 and now_dt.minute == 0):  # Daily cleanup at 3 AM