medicafe 0.250822.3__py3-none-any.whl → 0.250912.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1729 +1,1822 @@
1
- # MediBot_Preprocessor_lib.py
2
- """
3
- Core preprocessing library for MediBot
4
- Contains core preprocessing functions and utilities.
5
- """
6
-
7
- import csv, time, os, sys
8
- from datetime import datetime, timedelta
9
- from collections import OrderedDict
10
-
11
- # Try to import chardet for encoding detection
12
- try:
13
- import chardet
14
- except ImportError:
15
- chardet = None # Fallback if chardet is not available
16
-
17
- # Use core utilities for standardized imports
18
- from MediCafe.core_utils import (
19
- import_medibot_module,
20
- import_medilink_module,
21
- get_config_loader_with_fallback
22
- )
23
-
24
- # Initialize configuration loader with fallback
25
- MediLink_ConfigLoader = get_config_loader_with_fallback()
26
-
27
- # Import MediLink_DataMgmt using centralized import function
28
- MediLink_DataMgmt = import_medilink_module('MediLink_DataMgmt')
29
-
30
# Import MediBot modules using centralized import functions.
# Both imports are optional: import_medibot_module() may return a falsy value,
# in which case every name derived from the module is bound to None so that
# later references never fail with NameError.
MediBot_UI = import_medibot_module('MediBot_UI')
if MediBot_UI:
    app_control = getattr(MediBot_UI, 'app_control', None)
    get_app_control = getattr(MediBot_UI, '_get_app_control', None)
else:
    app_control = None
    get_app_control = None


def _ac():
    """
    Return the current app-control object exposed by MediBot_UI, or None.

    Prefers MediBot_UI._get_app_control() when that accessor exists and falls
    back to the module attribute MediBot_UI.app_control. Returns None when
    MediBot_UI could not be imported (previously this helper was simply not
    defined in that case).
    """
    if not MediBot_UI:
        return None
    try:
        return get_app_control() if get_app_control else getattr(MediBot_UI, 'app_control', None)
    except Exception:
        # Best-effort accessor: any failure in the accessor falls back to the
        # plain module attribute.
        return getattr(MediBot_UI, 'app_control', None)


MediBot_docx_decoder = import_medibot_module('MediBot_docx_decoder')
if MediBot_docx_decoder:
    parse_docx = getattr(MediBot_docx_decoder, 'parse_docx', None)
else:
    parse_docx = None
48
-
49
# Add the parent directory of the project to the Python path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

# Configuration cache to avoid repeated loading
# Both start as None; get_cached_configuration_xp_safe() fills them on first use.
_config_cache = None
_crosswalk_cache = None

# Use core utilities for standardized imports
# NOTE: this rebinds MediLink_ConfigLoader, replacing the value assigned above
# from get_config_loader_with_fallback().
from MediCafe.core_utils import get_shared_config_loader
MediLink_ConfigLoader = get_shared_config_loader()

# Ensure MediLink_ConfigLoader is available
if MediLink_ConfigLoader is None:
    print("Warning: MediLink_ConfigLoader not available. Some functionality may be limited.")
    # Create a minimal fallback logger
    # It only provides log(); it has no load_configuration(), which
    # get_cached_configuration_xp_safe() checks for with hasattr().
    class FallbackLogger:
        def log(self, message, level="INFO"):
            # Mirror the log call on stdout as "[LEVEL] message".
            print("[{}] {}".format(level, message))
    MediLink_ConfigLoader = FallbackLogger()

# Import centralized logging configuration
try:
    from MediCafe.logging_config import PERFORMANCE_LOGGING
except ImportError:
    # Fallback to local flag if centralized config is not available
    PERFORMANCE_LOGGING = False
75
-
76
- # XP Compatibility: Add robust fallback for configuration loading
77
def _load_json_or_empty(path):
    """Return the parsed JSON document at ``path``, or ``{}`` if it is missing or unreadable."""
    import json
    if not os.path.exists(path):
        return {}
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except Exception as e:
        # Best-effort loader: report the problem and fall back to an empty mapping.
        print("Warning: Failed to load configuration files directly: {}".format(e))
        return {}


def get_cached_configuration_xp_safe():
    """
    XP-compatible version of get_cached_configuration with robust fallbacks.

    Resolution order:
      1. Return the module-level cache when both entries are already populated.
      2. Ask MediLink_ConfigLoader.load_configuration() when the loader offers it.
      3. Read ``json/config.json`` and ``json/crosswalk.json`` from the directory
         above this file. Each file is loaded independently, so a missing or
         corrupt file yields ``{}`` for that entry only and no longer discards
         the other file's successfully loaded contents.

    Returns:
        tuple: (config, crosswalk). This function does not raise; failures are
        reported on stdout and replaced with empty dictionaries.
    """
    global _config_cache, _crosswalk_cache

    # If we already have cached data, return it
    if _config_cache is not None and _crosswalk_cache is not None:
        return _config_cache, _crosswalk_cache

    # Try to load configuration using the standard method
    try:
        if MediLink_ConfigLoader and hasattr(MediLink_ConfigLoader, 'load_configuration'):
            _config_cache, _crosswalk_cache = MediLink_ConfigLoader.load_configuration()
            return _config_cache, _crosswalk_cache
    except Exception as e:
        print("Warning: Failed to load configuration via MediLink_ConfigLoader: {}".format(e))

    # Fallback: Try to load configuration files directly
    try:
        project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    except Exception as e:
        print("Warning: Failed to load configuration files directly: {}".format(e))
        _config_cache = {}
        _crosswalk_cache = {}
        return _config_cache, _crosswalk_cache

    _config_cache = _load_json_or_empty(os.path.join(project_dir, 'json', 'config.json'))
    _crosswalk_cache = _load_json_or_empty(os.path.join(project_dir, 'json', 'crosswalk.json'))
    return _config_cache, _crosswalk_cache
124
-
125
class InitializationError(Exception):
    """Raised when the module-level settings cannot be initialised from the config."""

    def __init__(self, message):
        # Keep the text available as an attribute as well as via str(exc).
        super().__init__(message)
        self.message = message
129
-
130
def initialize(config):
    """
    Publish selected configuration values as module-level globals.

    Sets AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping and page_end_markers from
    ``config``, using an empty default for any key that is absent.
    ``field_mapping`` is wrapped in an OrderedDict.

    Raises:
        InitializationError: if reading a key raises AttributeError (for
        example when ``config`` has no ``get`` method).
    """
    global AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping, page_end_markers

    settings_with_defaults = (
        ('AHK_EXECUTABLE', ""),
        ('CSV_FILE_PATH', ""),
        ('field_mapping', {}),
        ('page_end_markers', []),
    )

    module_namespace = globals()
    for name, fallback in settings_with_defaults:
        try:
            value = config.get(name, fallback)
            if name == 'field_mapping':
                value = OrderedDict(value)
            module_namespace[name] = value
        except AttributeError:
            raise InitializationError("Error: '{}' not found in config.".format(name))
145
-
146
def get_cached_configuration():
    """
    Returns cached configuration and crosswalk data to avoid repeated I/O operations.

    This is a thin alias: it forwards to get_cached_configuration_xp_safe() and
    returns that function's result unchanged (a (config, crosswalk) pair).
    """
    return get_cached_configuration_xp_safe()
151
-
152
def open_csv_for_editing(csv_file_path):
    """
    Open the CSV file with its associated application so the user can edit it.

    Uses os.startfile() where the platform provides it; it raises OSError on
    failure, so problems are actually reported. Otherwise falls back to the
    original ``start`` shell command and checks its exit status, because
    os.system() never raises and a failed launch previously went unnoticed.

    Parameters:
    csv_file_path (str): Path of the CSV file to open.

    Returns:
    None: Progress and failures are reported on stdout.
    """
    try:
        if hasattr(os, 'startfile'):
            os.startfile(csv_file_path)
        else:
            exit_status = os.system('start "" "{}"'.format(csv_file_path))
            if exit_status != 0:
                raise OSError("'start' command exited with status {}".format(exit_status))
        print("After saving the revised CSV, please re-run MediBot.")
    except Exception as e:
        print("Failed to open CSV file:", e)
159
-
160
- # Function to clean the headers
161
def clean_header(headers):
    """
    Cleans the header strings by removing unwanted characters and trimming whitespace.

    Each header is stripped, then every character that is not alphanumeric,
    whitespace, a hyphen or an underscore is removed.

    Parameters:
    headers (list of str): The original header strings. ``None`` (what
        csv.DictReader reports for an empty file) is treated as no headers.

    Returns:
    list of str: The cleaned header strings.

    Raises:
    ValueError: If 'Surgery Date' is not among the cleaned headers.
    """
    allowed_punctuation = ('-', '_')
    # An empty CSV yields fieldnames=None; treat it as "no headers" so the
    # caller gets the documented ValueError instead of a TypeError.
    source_headers = headers if headers is not None else []

    cleaned_headers = [
        ''.join(
            char for char in header.strip()
            if char.isalnum() or char.isspace() or char in allowed_punctuation
        )
        for header in source_headers
    ]

    # Log the original and cleaned headers for debugging
    MediLink_ConfigLoader.log("Original headers: {}".format(headers), level="INFO")
    MediLink_ConfigLoader.log("Cleaned headers: {}".format(cleaned_headers), level="INFO")

    # The rest of the pipeline keys on 'Surgery Date', so fail early without it.
    if 'Surgery Date' not in cleaned_headers:
        MediLink_ConfigLoader.log("WARNING: 'Surgery Date' header not found after cleaning.", level="WARNING")
        print("WARNING: 'Surgery Date' header not found after cleaning.")
        raise ValueError("Error: 'Surgery Date' header not found after cleaning.")

    return cleaned_headers
191
-
192
- # Function to load and process CSV data
193
def load_csv_data(csv_file_path):
    """
    Load a CSV file into a list of dictionaries keyed by cleaned header names.

    The encoding is detected with chardet when that package is available;
    UTF-8 is used when chardet is missing or cannot decide. Headers are
    normalised through clean_header().

    Parameters:
    csv_file_path (str): Path of the CSV file to read.

    Returns:
    list of dict: One dictionary per data row.

    Raises:
    ValueError: Propagated from clean_header() when 'Surgery Date' is missing.

    Note:
    A missing file, an I/O error or undecodable content prints a message and
    terminates the process with sys.exit(1).
    """
    try:
        # Check if the file exists
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError("***Error: CSV file '{}' not found.".format(csv_file_path))

        # Detect the file encoding
        with open(csv_file_path, 'rb') as f:
            raw_data = f.read()

        detection = chardet.detect(raw_data) if chardet else None
        if detection and detection.get('encoding'):
            encoding = detection['encoding']
            confidence = detection.get('confidence') or 0.0
        elif detection is not None:
            # chardet could not decide (e.g. empty file): do not pass
            # encoding=None to open(), which would silently use the locale default.
            encoding = 'utf-8'
            confidence = detection.get('confidence') or 0.0
        else:
            # Fallback to UTF-8 when chardet is not available
            encoding = 'utf-8'
            confidence = 1.0
        print("Detected encoding: {} (Confidence: {:.2f})".format(encoding, confidence))

        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(csv_file_path, 'r', encoding=encoding, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            original_headers = reader.fieldnames or []  # None for an empty file
            cleaned_headers = clean_header(original_headers)

            # Pair each cleaned name with the header it came from. If two
            # headers clean to the same name, the later column wins.
            header_pairs = list(zip(cleaned_headers, original_headers))

            return [
                {clean: row[orig] for clean, orig in header_pairs}
                for row in reader
            ]
    except FileNotFoundError as e:
        print(e)  # Print the informative error message
        print("Hint: Check if CSV file is located in the expected directory or specify a different path in config file.")
        print("Please correct the issue and re-run MediBot.")
        sys.exit(1)  # Halt the script
    except IOError as e:
        print("Error reading CSV file: {}. Please check the file path and permissions.".format(e))
        sys.exit(1)  # Halt the script in case of other IO errors
    except UnicodeDecodeError as e:
        # Wrong encoding guess: report it like other read failures instead of
        # surfacing a raw traceback.
        print("Error reading CSV file: {}. Please check the file path and permissions.".format(e))
        sys.exit(1)
240
-
241
- # CSV Pre-processor Helper functions
242
def add_columns(csv_data, column_headers):
    """
    Add one or more columns, initialised to the empty string, to every row.

    Parameters:
    csv_data (list of dict): Rows to modify in place.
    column_headers (list of str or str): Column name(s) to set on each row.
        A name that already exists in a row is reset to ''.

    Returns:
    None: The rows are modified in place.

    Raises:
    ValueError: If column_headers is neither a list nor a string.
    """
    if isinstance(column_headers, str):
        names = [column_headers]
    elif isinstance(column_headers, list):
        names = column_headers
    else:
        raise ValueError("column_headers should be a list or a string")

    for record in csv_data:
        for name in names:
            record[name] = ''
262
-
263
- # Extracting the list to a variable for future refactoring:
264
def filter_rows(csv_data):
    """
    Drop, in place, rows without a Patient ID and rows whose Primary Insurance
    is one of the excluded payers.
    """
    # TODO: This should be written in the crosswalk and not hardcoded here.
    skipped_payers = frozenset(('AETNA', 'AETNA MEDICARE', 'HUMANA MED HMO'))

    kept_rows = []
    for record in csv_data:
        if not record.get('Patient ID'):
            continue
        if record.get('Primary Insurance') in skipped_payers:
            continue
        kept_rows.append(record)

    # Slice assignment keeps the caller's list object.
    csv_data[:] = kept_rows
268
-
269
def detect_date_format(date_str):
    """
    PERFORMANCE OPTIMIZATION: Quickly detect the most likely date format
    to avoid trying all formats for every date string.

    The guess is based on the shape of the date (separator and the number of
    digits in the first and last component), not on the overall string
    length. The previous length-based check reported ``MM/DD/YYYY`` as
    ``%Y/%m/%d`` and short dates such as ``1/5/24`` as ``%m/%d/%Y``.

    Parameters:
    - date_str (str): The date string to analyze. Anything after the first
      whitespace (e.g. a time component) is ignored.

    Returns:
    - str: The most likely strptime format for the date part, or None if unclear
    """
    if not date_str:
        return None

    # Remove time components if present; whitespace-only input has no tokens.
    tokens = str(date_str).split()
    if not tokens:
        return None
    date_only = tokens[0]

    # Dash is checked first, matching the precedence of the earlier implementation.
    for separator in ('-', '/'):
        if date_only.count(separator) == 2:
            break
    else:
        return None

    parts = date_only.split(separator)
    if not all(part.isdigit() for part in parts):
        return None
    first, _, last = parts

    if len(first) == 4:
        # YYYY-MM-DD or YYYY/MM/DD
        return '%Y{0}%m{0}%d'.format(separator)
    if len(first) <= 2 and len(last) == 4:
        # MM-DD-YYYY or MM/DD/YYYY
        return '%m{0}%d{0}%Y'.format(separator)
    if len(first) <= 2 and len(last) == 2:
        # MM-DD-YY or MM/DD/YY
        return '%m{0}%d{0}%y'.format(separator)

    return None
311
-
312
class OptimizedDate:
    """
    Optimized date object that pre-computes all common format variations
    to avoid redundant datetime conversions throughout the application.

    Instances compare with other OptimizedDate objects and with objects that
    expose ``strftime`` (such as datetime) using the wrapped datetime. They
    are hashable, with a hash consistent with that equality, so they can be
    used in sets and as dictionary keys.
    """

    # Formats accepted by from_string(), most common first.
    _PARSE_FORMATS = ('%m/%d/%Y', '%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%Y/%m/%d', '%Y-%m-%d')

    def __init__(self, datetime_obj):
        self.datetime = datetime_obj
        # Pre-compute all common format variations
        self._display_short = datetime_obj.strftime('%m-%d')  # For table display
        self._display_full = datetime_obj.strftime('%m-%d-%Y')  # Full format
        self._medisoft_format = datetime_obj.strftime('%m%d%Y')  # For Medisoft entry
        self._iso_format = datetime_obj.strftime('%Y-%m-%d')  # For sorting/comparison

    @property
    def display_short(self):
        """Short display format: MM-DD"""
        return self._display_short

    @property
    def display_full(self):
        """Full display format: MM-DD-YYYY"""
        return self._display_full

    @property
    def medisoft_format(self):
        """Medisoft entry format: MMDDYYYY"""
        return self._medisoft_format

    @property
    def iso_format(self):
        """ISO format for sorting: YYYY-MM-DD"""
        return self._iso_format

    def __str__(self):
        return self._display_full

    def __repr__(self):
        return "OptimizedDate({})".format(self._display_full)

    @staticmethod
    def _as_datetime(other):
        """Return the value to compare against, or NotImplemented for unsupported types."""
        if isinstance(other, OptimizedDate):
            return other.datetime
        if hasattr(other, 'strftime'):  # datetime object
            return other
        return NotImplemented

    def __eq__(self, other):
        other_dt = self._as_datetime(other)
        if other_dt is NotImplemented:
            # Python falls back to identity comparison, i.e. "not equal".
            return NotImplemented
        return self.datetime == other_dt

    def __hash__(self):
        # Defining __eq__ removes the inherited __hash__; restore one that
        # agrees with equality (an instance equals its wrapped datetime).
        return hash(self.datetime)

    def __lt__(self, other):
        other_dt = self._as_datetime(other)
        if other_dt is NotImplemented:
            return NotImplemented
        return self.datetime < other_dt

    def __le__(self, other):
        other_dt = self._as_datetime(other)
        if other_dt is NotImplemented:
            return NotImplemented
        return self.datetime <= other_dt

    def __gt__(self, other):
        other_dt = self._as_datetime(other)
        if other_dt is NotImplemented:
            return NotImplemented
        return self.datetime > other_dt

    def __ge__(self, other):
        other_dt = self._as_datetime(other)
        if other_dt is NotImplemented:
            return NotImplemented
        return self.datetime >= other_dt

    def strftime(self, format_str):
        """Fallback for any custom format needs"""
        return self.datetime.strftime(format_str)

    @classmethod
    def from_string(cls, date_str, cleaned=False):
        """
        Create OptimizedDate from string, with optional pre-cleaning.

        Args:
            date_str: Date string to parse
            cleaned: If True, assumes string is already cleaned and skips
                clean_surgery_date_string()

        Returns:
            OptimizedDate object or None if the string is empty or parsing fails
        """
        if not cleaned:
            date_str = clean_surgery_date_string(date_str)
        # Guard both paths: None or '' can also arrive with cleaned=True.
        if not date_str:
            return None

        for fmt in cls._PARSE_FORMATS:
            try:
                return cls(datetime.strptime(date_str, fmt))
            except ValueError:
                continue

        return None
408
-
409
def clean_surgery_date_string(date_str):
    """
    Cleans and normalizes surgery date strings to handle damaged data.

    Processing steps:
      1. Convert to text and collapse all whitespace (including newlines and
         tabs) to single spaces.
      2. Try the format suggested by detect_date_format(), then a fixed list of
         known formats, most common first.
      3. As a last resort, pull the first three numbers out of the date part
         and interpret them as month/day/year (or year/month/day when the
         first number has four digits).

    Parameters:
    - date_str (str): The raw date string from the CSV

    Returns:
    - str: Cleaned date string in MM/DD/YYYY format, or empty string if unparseable
    """
    if not date_str:
        return ''

    # str.split() with no arguments splits on any whitespace run, so this both
    # strips the value and normalizes embedded newlines/tabs.
    date_str = ' '.join(str(date_str).split())
    if not date_str:
        return ''

    # Ordered so that the average number of attempts stays low.
    known_formats = (
        '%m/%d/%Y',
        '%m-%d-%Y',
        '%m/%d/%y',
        '%m-%d-%y',
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%m/%d/%Y %H:%M:%S',
        '%m-%d-%Y %H:%M:%S',
    )

    # PERFORMANCE OPTIMIZATION: Try detected format first
    candidate_formats = []
    detected_format = detect_date_format(date_str)
    if detected_format:
        candidate_formats.append(detected_format)
    candidate_formats.extend(fmt for fmt in known_formats if fmt != detected_format)

    for fmt in candidate_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime('%m/%d/%Y')
        except ValueError:
            continue

    # If no format matches, try to extract date components
    import re
    date_only = date_str.split()[0]  # Take first part if there's extra text
    numbers = re.findall(r'\d+', date_only)

    if len(numbers) >= 3:
        if len(numbers[0]) == 4:
            # Leading four-digit number: treat as YYYY, MM, DD
            year, month, day = int(numbers[0]), int(numbers[1]), int(numbers[2])
        else:
            # Assume MM/DD/YYYY or MM-DD-YYYY ordering
            month, day, year = int(numbers[0]), int(numbers[1]), int(numbers[2])

        # Expand 2-digit years BEFORE range validation; the previous order made
        # this branch unreachable because the range check rejected them first.
        if year < 100:
            year += 2000 if year < 50 else 1900

        if 1 <= month <= 12 and 1 <= day <= 31 and 1900 <= year <= 2100:
            try:
                return datetime(year, month, day).strftime('%m/%d/%Y')
            except ValueError:
                # e.g. February 30th: in range per field but not a real date
                pass

    # If all parsing attempts fail, return empty string
    return ''
520
-
521
def convert_surgery_date(csv_data):
    """
    Converts surgery date strings to datetime objects with comprehensive data cleaning.

    Each row's 'Surgery Date' is replaced in place by a datetime. Missing or
    unparseable values become ``datetime.min``. Values that already are
    datetime objects are left untouched, so calling this function twice on the
    same rows does not destroy the dates.

    Parameters:
    - csv_data (list): List of dictionaries containing CSV row data

    Returns:
    - None: Rows are modified in place; a timing/count summary is logged.
    """
    # TIMING: Start surgery date conversion with granular tracking
    total_start_time = time.time()
    date_cleaning_time = 0.0
    date_parsing_time = 0.0
    processed_count = 0
    empty_count = 0
    error_count = 0

    print("Starting surgery date conversion for {} rows...".format(len(csv_data)))
    # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes

    # clean_surgery_date_string() always emits this format.
    standard_format = '%m/%d/%Y'

    for row in csv_data:
        surgery_date_str = row.get('Surgery Date', '')

        if isinstance(surgery_date_str, datetime):
            # Already converted on an earlier pass; re-cleaning str(datetime)
            # would fail and overwrite a valid date with datetime.min.
            processed_count += 1
            continue

        if not surgery_date_str:
            # Routine empty dates are counted but not logged.
            empty_count += 1
            row['Surgery Date'] = datetime.min  # Assign a minimum datetime value if empty
            continue

        # TIMING: date string cleaning
        cleaning_start = time.time()
        cleaned_date_str = clean_surgery_date_string(surgery_date_str)
        date_cleaning_time += time.time() - cleaning_start

        if not cleaned_date_str:
            error_count += 1
            # LOGGING STRATEGY: Log actual errors (cleaning failures) at INFO level
            if error_count <= 5:  # Only log first 5 errors
                MediLink_ConfigLoader.log("Error: Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row), level="INFO")
                print("Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row))
            row['Surgery Date'] = datetime.min  # Assign a minimum datetime value if cleaning fails
            continue

        # TIMING: date parsing
        parsing_start = time.time()
        try:
            row['Surgery Date'] = datetime.strptime(cleaned_date_str, standard_format)
            processed_count += 1
        except ValueError as e:
            error_count += 1
            # LOGGING STRATEGY: Log actual errors (parsing failures) at INFO level
            if error_count <= 5:  # Only log first 5 parsing errors
                MediLink_ConfigLoader.log("Error parsing cleaned Surgery Date '{}': {} for row: {}".format(
                    cleaned_date_str, e, row), level="INFO")
            row['Surgery Date'] = datetime.min  # Assign a minimum datetime value if parsing fails
        date_parsing_time += time.time() - parsing_start

    # TIMING: End total surgery date conversion
    total_duration = time.time() - total_start_time

    if PERFORMANCE_LOGGING:
        # A coarse clock or an empty input can yield a zero duration; avoid
        # dividing by it when computing the percentages.
        if total_duration > 0:
            cleaning_share = (date_cleaning_time / total_duration) * 100
            parsing_share = (date_parsing_time / total_duration) * 100
        else:
            cleaning_share = 0.0
            parsing_share = 0.0
        print("Surgery date conversion completed:")
        print(" - Total duration: {:.2f} seconds".format(total_duration))
        print(" - Date cleaning time: {:.2f} seconds ({:.1f}%)".format(date_cleaning_time, cleaning_share))
        print(" - Date parsing time: {:.2f} seconds ({:.1f}%)".format(date_parsing_time, parsing_share))
        print(" - Processed: {} rows, Empty: {} rows, Errors: {} rows".format(processed_count, empty_count, error_count))

    # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
    MediLink_ConfigLoader.log("Surgery date conversion completed - Total: {:.2f}s, Cleaning: {:.2f}s, Parsing: {:.2f}s, Processed: {}, Empty: {}, Errors: {}".format(
        total_duration, date_cleaning_time, date_parsing_time, processed_count, empty_count, error_count), level="INFO")
611
-
612
def _surgery_date_as_datetime(value):
    """Coerce a 'Surgery Date' value (datetime or MM/DD/YYYY / MM-DD-YYYY text) to datetime; datetime.min if unusable."""
    if isinstance(value, datetime):
        return value
    if isinstance(value, str) and value.strip():
        text = value.strip()
        for fmt in ('%m/%d/%Y', '%m-%d-%Y'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
    # Empty, unparseable or unsupported values sort first and count as missing.
    return datetime.min


def sort_and_deduplicate(csv_data):
    """
    Collapse rows to one per Patient ID and sort them by earliest surgery date.

    For each patient the row with the latest surgery date is kept (it carries
    the most current demographics; on a tie the first row seen is kept). The
    kept row gains three keys:

    - '_all_surgery_dates': every distinct surgery date seen for the patient,
      as sorted 'MM-DD-YYYY' strings, with 'MISSING' for absent/invalid dates.
    - '_primary_surgery_date': the kept row's own 'Surgery Date' value.
    - '_earliest_surgery_date': the first non-'MISSING' entry of
      '_all_surgery_dates', or None when the patient has no valid date.

    All date values go through one coercion helper, so string dates in
    MM/DD/YYYY form no longer raise ValueError while the per-patient date list
    is being sorted.

    Parameters:
    - csv_data (list of dict): Rows to process; replaced in place with the
      deduplicated rows sorted by (earliest date, last name, first name,
      patient id), names compared case-insensitively.
    """
    unique_patients = {}   # patient id -> row kept for that patient
    patient_dates = {}     # patient id -> set of datetime values seen

    for row in csv_data:
        patient_id = row.get('Patient ID')
        surgery_dt = _surgery_date_as_datetime(row.get('Surgery Date'))

        if patient_id not in unique_patients:
            unique_patients[patient_id] = row
            patient_dates[patient_id] = {surgery_dt}
            continue

        patient_dates[patient_id].add(surgery_dt)
        kept_dt = _surgery_date_as_datetime(unique_patients[patient_id].get('Surgery Date'))
        # Keep the most current demographic data (later surgery date takes precedence)
        if surgery_dt > kept_dt:
            unique_patients[patient_id] = row

    for patient_id, row in unique_patients.items():
        date_labels = []
        for surgery_dt in sorted(patient_dates[patient_id]):
            label = 'MISSING' if surgery_dt == datetime.min else surgery_dt.strftime('%m-%d-%Y')
            # Different times on the same day render identically; keep one.
            if label not in date_labels:
                date_labels.append(label)

        row['_all_surgery_dates'] = date_labels
        row['_primary_surgery_date'] = row.get('Surgery Date')  # Date that carries the demographics
        row['_earliest_surgery_date'] = next(
            (label for label in date_labels if label != 'MISSING'), None)

    def sort_key(row):
        # Prefer earliest surgery date across all known dates for the patient,
        # falling back to the row's own Surgery Date.
        earliest = row.get('_earliest_surgery_date')
        reference = earliest if earliest else row.get('Surgery Date')
        # Tie-break per requirement: last name (case-insensitive), then first name, then patient id
        last_name = (row.get('Patient Last') or '').strip().upper()
        first_name = (row.get('Patient First') or '').strip().upper()
        patient_id_tiebreak = str(row.get('Patient ID') or '')
        return (_surgery_date_as_datetime(reference), last_name, first_name, patient_id_tiebreak)

    csv_data[:] = sorted(unique_patients.values(), key=sort_key)  # TODO Does this need to be sorted twice? once before and once after?
753
-
754
- # TODO: Consider adding an option in the config to sort based on Surgery Schedules when available.
755
- # If no schedule is available, the current sorting strategy will be used.
756
-
757
def combine_fields(csv_data):
    """
    Derive display fields on every row, in place.

    - 'Surgery Date' becomes an 'MM-DD-YYYY' string, or 'MISSING' when the
      value is empty or the ``datetime.min`` placeholder.
    - 'Patient Name' becomes 'Last, First M': multi-word first/last names are
      joined with underscores and the middle name is reduced to its initial.
      A one-letter middle name is kept; the previous ``len(...) > 1`` test
      dropped it.
    - 'Patient Street' joins the non-empty address lines with a space.

    Fields that are absent or None are treated as empty text.

    Parameters:
    - csv_data (list of dict): Rows to update in place.
    """
    def _text(row, key):
        # csv.DictReader fills short rows with None, so normalise before
        # calling string methods.
        value = row.get(key)
        return '' if value is None else str(value).strip()

    for row in csv_data:
        # Safely handle the 'Surgery Date' conversion with clear missing indicator
        surgery_date = row.get('Surgery Date')
        if isinstance(surgery_date, datetime):
            if surgery_date == datetime.min:
                row['Surgery Date'] = 'MISSING'
            else:
                try:
                    row['Surgery Date'] = surgery_date.strftime('%m-%d-%Y')
                except ValueError:
                    # strftime can reject out-of-range years on some platforms
                    row['Surgery Date'] = 'MISSING'
        elif surgery_date:
            # Already a non-empty string
            row['Surgery Date'] = str(surgery_date)
        else:
            row['Surgery Date'] = 'MISSING'

        first_name = '_'.join(_text(row, 'Patient First').split())  # Join the first name parts with underscores
        middle_name = _text(row, 'Patient Middle')[:1]  # Take only the first character or empty
        last_name = '_'.join(_text(row, 'Patient Last').split())  # Join the last name parts with underscores
        # Comma between last and first, space before middle
        row['Patient Name'] = ', '.join(filter(None, [last_name, first_name])) + (' ' + middle_name if middle_name else '')

        address1 = _text(row, 'Patient Address1')
        address2 = _text(row, 'Patient Address2')
        row['Patient Street'] = ' '.join(filter(None, [address1, address2]))  # Join non-empty addresses
784
-
785
def apply_replacements(csv_data, crosswalk):
    """
    Apply the crosswalk's 'csv_replacements' mapping to each row, in place.

    Only the 'Patient SSN', 'Primary Insurance' and 'Ins1 Payer ID' columns are
    examined. At most one substitution is made per row: the first replacement
    entry (in mapping order) that matches any of those columns (in that column
    order) is applied, and the row is then left alone.
    """
    mapping = crosswalk.get('csv_replacements', {})
    target_columns = ('Patient SSN', 'Primary Insurance', 'Ins1 Payer ID')

    for record in csv_data:
        # Lazily search replacement-major, column-minor for the first match.
        first_hit = next(
            (
                (column, replacement)
                for current, replacement in mapping.items()
                for column in target_columns
                if record.get(column) == current
            ),
            None,
        )
        if first_hit is not None:
            column, replacement = first_hit
            record[column] = replacement
801
-
802
- import difflib
803
- from collections import defaultdict
804
-
805
def find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names):
    """
    Pick the Medisoft ID whose MAINS name is most similar to an insurance name.

    Both the insurance name and every MAINS name have their digit characters
    removed and are upper-cased before being compared with
    difflib.SequenceMatcher. The ID with the strictly highest ratio wins; on a
    tie the earliest candidate is kept. No minimum ratio is required.

    Parameters:
    - insurance_name (str): The insurance name from the CSV row.
    - medisoft_ids (list): Medisoft IDs associated with the Payer ID.
    - medisoft_to_mains_names (dict): Medisoft ID -> list of MAINS names.

    Returns:
    - The winning entry of medisoft_ids (returned unchanged, not converted),
      or None when no candidate produced a ratio above zero.
    """
    def _letters_only_upper(text):
        # Remove digits, then upper-case what is left.
        return ''.join(ch for ch in text if not ch.isdigit()).upper()

    # The insurance name only needs to be normalized once
    wanted = _letters_only_upper(insurance_name)

    top_ratio = 0
    top_id = None

    for candidate_id in medisoft_ids:
        for mains_name in medisoft_to_mains_names.get(candidate_id, []):
            candidate = _letters_only_upper(mains_name)

            # Record what is being compared before computing the ratio
            MediLink_ConfigLoader.log("Processing Medisoft ID '{}': Comparing processed insurance '{}' with processed mains '{}'.".format(candidate_id, wanted, candidate), level="DEBUG")

            ratio = difflib.SequenceMatcher(None, wanted, candidate).ratio()

            MediLink_ConfigLoader.log("Match ratio for Medisoft ID '{}': {:.2f}".format(candidate_id, ratio), level="DEBUG")

            if ratio > top_ratio:
                top_ratio, top_id = ratio, candidate_id
                MediLink_ConfigLoader.log("New best match found: Medisoft ID '{}' with match ratio {:.2f}".format(top_id, top_ratio), level="DEBUG")

    MediLink_ConfigLoader.log("Final best match ratio: {:.2f} for Medisoft ID '{}'".format(top_ratio, top_id), level="DEBUG")

    # No threshold is applied: whatever scored best is returned
    return top_id
850
-
851
def NEW_update_insurance_ids(csv_data, config, crosswalk):
    """
    Updates the 'Ins1 Insurance ID' field in each row of csv_data based on the crosswalk and MAINS data.

    Resolution per row (rows with an empty 'Ins1 Payer ID' are left alone):
    - Payer has no Medisoft IDs: the row is skipped. If the payer is entirely
      absent from the crosswalk an empty placeholder entry is added for it; an
      entry that already exists is never overwritten.
    - Payer has exactly one Medisoft ID: it is converted to int and assigned
      (None is assigned when the conversion fails).
    - Payer has several Medisoft IDs: the row's 'Primary Insurance' name is
      fuzzy-matched against the MAINS names via find_best_medisoft_id; when no
      match is found the first ID is used as the default.

    Parameters:
    - csv_data (list of dict): The CSV data where each row is represented as a dictionary.
    - config (dict): Configuration object passed to load_insurance_data_from_mains.
    - crosswalk (dict): Crosswalk data containing mappings between Payer IDs and Medisoft IDs.

    Returns:
    - None: The function modifies csv_data (and possibly crosswalk) in place.
    """
    processed_payer_ids = set()  # Payer IDs already seen, used only for logging
    MediLink_ConfigLoader.log("Starting update of insurance IDs.", level="INFO")

    # Pre-build a flattened payer lookup so the row loop does one dict access
    payer_cache = {}
    for payer_id, details in crosswalk.get('payer_id', {}).items():
        payer_cache[payer_id] = {
            'medisoft_id': details.get('medisoft_id', []),
            'medisoft_medicare_id': details.get('medisoft_medicare_id', []),
            'endpoint': details.get('endpoint', None)
        }
    MediLink_ConfigLoader.log("Built payer cache for {} payers".format(len(payer_cache)), level="DEBUG")

    # Load MAINS data; assumed to return a dict mapping insurance names to Medisoft IDs
    insurance_to_id = load_insurance_data_from_mains(config)
    MediLink_ConfigLoader.log("Loaded MAINS data for insurance to ID mapping.", level="DEBUG")

    # Invert the mapping to get Medisoft ID -> list of MAINS names
    medisoft_to_mains_names = defaultdict(list)
    for insurance_name, medisoft_id in insurance_to_id.items():
        medisoft_to_mains_names[medisoft_id].append(insurance_name)

    for row_idx, row in enumerate(csv_data, 1):
        # Kept for backward compatibility: the row carries its 1-based position
        row['_row_index'] = row_idx
        ins1_payer_id = (row.get('Ins1 Payer ID') or '').strip()
        MediLink_ConfigLoader.log("Processing row with Ins1 Payer ID: '{}'.".format(ins1_payer_id), level="DEBUG")

        if not ins1_payer_id:
            continue  # Nothing to resolve without a payer ID

        if ins1_payer_id not in processed_payer_ids:
            processed_payer_ids.add(ins1_payer_id)
            MediLink_ConfigLoader.log("Marked Payer ID '{}' as processed.".format(ins1_payer_id), level="DEBUG")

        payer_info = payer_cache.get(ins1_payer_id, {})
        medisoft_ids = payer_info.get('medisoft_id', [])
        MediLink_ConfigLoader.log("Retrieved Medisoft IDs for Payer ID '{}': {}".format(ins1_payer_id, medisoft_ids), level="DEBUG")

        if not medisoft_ids:
            known_payers = crosswalk.setdefault('payer_id', {})
            if ins1_payer_id in known_payers:
                # The payer already has an entry (it may hold Medicare IDs or
                # an endpoint); replacing it with a placeholder would lose that data.
                MediLink_ConfigLoader.log("No Medisoft IDs available for Payer ID '{}'; keeping existing crosswalk entry.".format(ins1_payer_id), level="WARNING")
            else:
                MediLink_ConfigLoader.log("No Medisoft IDs available for Payer ID '{}', creating placeholder entry.".format(ins1_payer_id), level="WARNING")
                placeholder_entry = {
                    'medisoft_id': [],           # Placeholder for future Medisoft IDs
                    'medisoft_medicare_id': [],  # Placeholder for future Medicare IDs
                    'endpoint': None             # Placeholder for future endpoint
                }
                known_payers[ins1_payer_id] = placeholder_entry
                payer_cache[ins1_payer_id] = placeholder_entry
            continue  # Skip further processing for this Payer ID

        # If only one Medisoft ID is associated, assign it directly
        if len(medisoft_ids) == 1:
            try:
                medisoft_id = int(medisoft_ids[0])
                row['Ins1 Insurance ID'] = medisoft_id
                # row is a dict, so the position comes from enumerate, not getattr
                MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row number {} with Payer ID '{}'.".format(medisoft_id, row_idx, ins1_payer_id), level="DEBUG")
            except (TypeError, ValueError) as e:
                MediLink_ConfigLoader.log("Error converting Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
                row['Ins1 Insurance ID'] = None
            continue  # Move to the next row

        # If multiple Medisoft IDs are associated, perform fuzzy matching
        insurance_name = (row.get('Primary Insurance') or '').strip()
        if not insurance_name:
            MediLink_ConfigLoader.log("Row with Payer ID '{}' missing 'Primary Insurance', skipping assignment.".format(ins1_payer_id), level="WARNING")
            continue  # Skip if insurance name is missing

        best_medisoft_id = find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names)

        if best_medisoft_id:
            # NOTE(review): unlike the other paths this value is assigned as
            # returned, without int() conversion - confirm the expected type.
            row['Ins1 Insurance ID'] = best_medisoft_id
            MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row with Payer ID '{}' based on fuzzy match.".format(best_medisoft_id, ins1_payer_id), level="INFO")
        else:
            # Default to the first Medisoft ID if no match is found
            try:
                default_medisoft_id = int(medisoft_ids[0])
                row['Ins1 Insurance ID'] = default_medisoft_id
                MediLink_ConfigLoader.log("No suitable match found. Defaulted to Medisoft ID '{}' for Payer ID '{}'.".format(default_medisoft_id, ins1_payer_id), level="INFO")
            except (TypeError, ValueError) as e:
                MediLink_ConfigLoader.log("Error converting default Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
                row['Ins1 Insurance ID'] = None
951
-
952
def update_insurance_ids(csv_data, config, crosswalk):
    """
    Assign 'Ins1 Insurance ID' on every row from the crosswalk payer mappings.

    For each payer in crosswalk['payer_id'] the first non-empty entry of
    'medisoft_medicare_id' and of 'medisoft_id' is converted to int and cached.
    Each row then receives the Medicare ID when one is available, otherwise
    the regular Medisoft ID, otherwise None. A non-empty payer ID that is not
    in the crosswalk gets an empty placeholder entry added to the crosswalk.

    Parameters:
    - csv_data (list of dict): The CSV rows, modified in place.
    - config (dict): Unused here; kept so the call signature stays stable.
    - crosswalk (dict): Crosswalk data; may gain placeholder payer entries.

    Returns:
    - None

    Logging strategy: per-row successes are not logged; only real events
    (new placeholder payers, bad IDs, unresolved rows) and the final summary.
    """
    total_start_time = time.time()
    processed_count = 0
    medicare_count = 0
    regular_count = 0
    placeholder_count = 0
    unresolved_count = 0  # Rows left with no insurance ID and no new placeholder

    print("Starting insurance ID updates for {} rows...".format(len(csv_data)))

    def _first_valid_id(raw_ids, payer_id, label):
        # Return the first non-empty entry of raw_ids as an int, or None.
        for candidate in raw_ids or []:
            if not candidate:
                continue  # Skip empty strings / None entries
            try:
                return int(candidate)
            except (TypeError, ValueError) as e:
                # A malformed crosswalk ID must not abort the whole update
                MediLink_ConfigLoader.log("Error converting {} ID '{}' to integer for Payer ID '{}': {}".format(
                    label, candidate, payer_id, e), level="ERROR")
                return None
        return None

    # TIMING: lookup dictionary building
    lookup_start_time = time.time()

    # Build both lookups in a single pass over the crosswalk
    payer_id_to_medisoft = {}
    payer_id_to_medicare = {}
    for payer_id, details in crosswalk.get('payer_id', {}).items():
        payer_id_to_medisoft[payer_id] = _first_valid_id(details.get('medisoft_id', []), payer_id, 'Medisoft')
        payer_id_to_medicare[payer_id] = _first_valid_id(details.get('medisoft_medicare_id', []), payer_id, 'Medicare')

    lookup_build_time = time.time() - lookup_start_time

    if PERFORMANCE_LOGGING:
        print("Built lookup dictionaries in {:.2f} seconds for {} payer IDs".format(lookup_build_time, len(payer_id_to_medisoft)))

    # TIMING: CSV processing
    csv_start_time = time.time()

    for row in csv_data:
        ins1_payer_id = (row.get('Ins1 Payer ID') or '').strip()

        # Try the Medicare ID first, then fall back to the regular ID
        medicare_id = payer_id_to_medicare.get(ins1_payer_id)
        insurance_id = medicare_id or payer_id_to_medisoft.get(ins1_payer_id)

        if insurance_id is None:
            if ins1_payer_id and ins1_payer_id not in payer_id_to_medisoft:
                # New payer ID: remember it so a mapping can be filled in later.
                # An empty payer ID is deliberately not stored as a crosswalk key.
                payer_id_to_medisoft[ins1_payer_id] = None
                payer_id_to_medicare[ins1_payer_id] = None
                crosswalk.setdefault('payer_id', {})[ins1_payer_id] = {
                    'medisoft_id': [],           # Placeholder for future Medisoft IDs
                    'medisoft_medicare_id': [],  # Placeholder for future Medicare IDs
                    'endpoint': None             # Placeholder for future endpoint
                }
                placeholder_count += 1
                if placeholder_count <= 5:  # Only log the first 5 placeholders
                    MediLink_ConfigLoader.log("Added placeholder entry for new Payer ID '{}'.".format(ins1_payer_id), level="INFO")
            else:
                # Known payer without any usable ID (or no payer ID at all);
                # this is not a Medicare assignment and must not be counted as one.
                unresolved_count += 1
        elif medicare_id:
            medicare_count += 1
        else:
            regular_count += 1

        # Assign the resolved insurance ID to the row
        row['Ins1 Insurance ID'] = insurance_id
        # TODO (SECONDARY QUEUE): When building a secondary-claims queue after Medicare crossover,
        # set claim_type='secondary' and attach prior payer fields here from the Medicare primary outcome:
        # - row['prior_payer_name'] = 'MEDICARE'
        # - row['prior_payer_id'] = best Medicare ID from config/crosswalk
        # - optionally row['primary_paid_amount'], row['cas_adjustments'] extracted from 835
        processed_count += 1

    csv_processing_time = time.time() - csv_start_time
    total_duration = time.time() - total_start_time

    def _share(part):
        # Percentage of the total run; guarded because a coarse clock can report 0 elapsed
        return (part / total_duration) * 100 if total_duration > 0 else 0.0

    if PERFORMANCE_LOGGING:
        print("Insurance ID updates completed:")
        print(" - Total duration: {:.2f} seconds".format(total_duration))
        print(" - Lookup building time: {:.2f} seconds ({:.1f}%)".format(lookup_build_time, _share(lookup_build_time)))
        print(" - CSV processing time: {:.2f} seconds ({:.1f}%)".format(csv_processing_time, _share(csv_processing_time)))
        print(" - Processed: {} rows, Medicare: {} rows, Regular: {} rows, Placeholders: {} rows".format(
            processed_count, medicare_count, regular_count, placeholder_count))

    if unresolved_count:
        MediLink_ConfigLoader.log("{} rows have no insurance ID mapping for their Payer ID.".format(unresolved_count), level="WARNING")

    # Completion summary at INFO level (end of looped event)
    MediLink_ConfigLoader.log("Insurance ID updates completed - Total: {:.2f}s, Lookup: {:.2f}s, Processing: {:.2f}s, Processed: {}, Medicare: {}, Regular: {}, Placeholders: {}".format(
        total_duration, lookup_build_time, csv_processing_time, processed_count, medicare_count, regular_count, placeholder_count), level="INFO")
1069
-
1070
def update_procedure_codes(csv_data, crosswalk):
    """
    Fill in 'Procedure Code' on each row from its 'Default Diagnosis #1' value.

    The row's Medisoft shorthand is translated back to a diagnosis code using
    the reversed crosswalk['diagnosis_to_medisoft'] mapping, and the diagnosis
    code is then looked up in the reversed crosswalk['procedure_to_diagnosis']
    mapping. Rows that cannot be resolved at either step get the procedure
    code "Unknown" and a WARNING log entry; a summary of everything that was
    missing is logged and printed at the end.

    Parameters:
    - csv_data (list of dict): The CSV rows, modified in place.
    - crosswalk (dict): Crosswalk data holding the two mappings above.

    Returns:
    - bool: Always True.
    """
    # Medisoft shorthand -> diagnosis code (reverse of the crosswalk mapping).
    # BUG We need to be careful here in case we decide we need to change the crosswalk data specifically with regard to the T8/H usage.
    medisoft_to_diagnosis = {}
    for diag, shorthand in crosswalk.get('diagnosis_to_medisoft', {}).items():
        medisoft_to_diagnosis[shorthand] = diag

    # Diagnosis code -> procedure code (reverse of procedure -> diagnoses)
    diagnosis_to_procedure = {}
    for proc, diag_list in crosswalk.get('procedure_to_diagnosis', {}).items():
        for diag in diag_list:
            diagnosis_to_procedure[diag] = proc

    # Tracking for the summary
    updated_count = 0
    missing_medisoft_codes = set()
    missing_procedure_mappings = set()

    for row_num, row in enumerate(csv_data, start=1):
        try:
            medisoft_code = row.get('Default Diagnosis #1', '').strip()
            diagnosis_code = medisoft_to_diagnosis.get(medisoft_code)

            if not diagnosis_code:
                # The shorthand has no diagnosis mapping; only remember real codes
                if medisoft_code:
                    missing_medisoft_codes.add(medisoft_code)
                row['Procedure Code'] = "Unknown"  # Will be handled by 837p encoder
                MediLink_ConfigLoader.log("Missing Medisoft code mapping for '{}' in row {}".format(
                    medisoft_code, row_num), level="WARNING")
                continue

            procedure_code = diagnosis_to_procedure.get(diagnosis_code)
            if not procedure_code:
                # The diagnosis is known but has no procedure assigned to it
                missing_procedure_mappings.add(diagnosis_code)
                row['Procedure Code'] = "Unknown"  # Will be handled by 837p encoder
                MediLink_ConfigLoader.log("Missing procedure mapping for diagnosis code '{}' (Medisoft code: '{}') in row {}".format(
                    diagnosis_code, medisoft_code, row_num), level="WARNING")
                continue

            row['Procedure Code'] = procedure_code
            updated_count += 1
        except Exception as e:
            MediLink_ConfigLoader.log("In update_procedure_codes, Error processing row {}: {}".format(row_num, e), level="ERROR")

    # Summary statistics
    MediLink_ConfigLoader.log("Total {} 'Procedure Code' rows updated.".format(updated_count), level="INFO")

    if missing_medisoft_codes:
        ordered_codes = sorted(missing_medisoft_codes)
        MediLink_ConfigLoader.log("Missing Medisoft code mappings: {}".format(ordered_codes), level="WARNING")
        print("WARNING: {} Medisoft codes need to be added to diagnosis_to_medisoft mapping: {}".format(
            len(missing_medisoft_codes), ordered_codes))

    if missing_procedure_mappings:
        ordered_diagnoses = sorted(missing_procedure_mappings)
        MediLink_ConfigLoader.log("Missing procedure mappings for diagnosis codes: {}".format(ordered_diagnoses), level="WARNING")
        print("WARNING: {} diagnosis codes need to be added to procedure_to_diagnosis mapping: {}".format(
            len(missing_procedure_mappings), ordered_diagnoses))

    return True
1129
-
1130
- def update_diagnosis_codes(csv_data):
1131
- try:
1132
- # TIMING: Start surgery schedule parsing timing
1133
- parsing_start_time = time.time()
1134
- print("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")))
1135
- MediLink_ConfigLoader.log("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")), level="INFO")
1136
-
1137
- # Use cached configuration instead of loading repeatedly
1138
- config, crosswalk = get_cached_configuration()
1139
-
1140
- # Extract the local storage path from the configuration
1141
- local_storage_path = config['MediLink_Config']['local_storage_path']
1142
-
1143
- # Initialize a dictionary to hold diagnosis codes from all DOCX files
1144
- all_patient_data = {}
1145
-
1146
- # Convert surgery dates in CSV data
1147
- convert_surgery_date(csv_data)
1148
-
1149
- # Extract all valid surgery dates from csv_data
1150
- surgery_dates = [row['Surgery Date'] for row in csv_data if row['Surgery Date'] != datetime.min]
1151
-
1152
- if not surgery_dates:
1153
- raise ValueError("No valid surgery dates found in csv_data.")
1154
-
1155
- # Determine the minimum and maximum surgery dates
1156
- min_surgery_date = min(surgery_dates)
1157
- max_surgery_date = max(surgery_dates)
1158
-
1159
- # Apply a +/-8-day margin to the surgery dates... Increased from 5 days.
1160
- margin = timedelta(days=8)
1161
- threshold_start = min_surgery_date - margin
1162
- threshold_end = max_surgery_date + margin
1163
-
1164
- # TODO (Low) This is a bad idea. We need a better way to handle this because it leaves
1165
- # us with a situation where if we take 'too long' to download the DOCX files, it will presume that the DOCX files are out of range because
1166
- # the modfied date is a bad proxy for the date of the surgery which would be contained inside the DOCX file. The processing overhead for extracting the
1167
- # date of the surgery from the DOCX file is non-trivial and computationally expensive so we need a smarter way to handle this.
1168
-
1169
- MediLink_ConfigLoader.log("BAD IDEA: Processing DOCX files modified between {} and {}.".format(threshold_start, threshold_end), level="INFO")
1170
-
1171
- # TIMING: Start file system operations
1172
- filesystem_start_time = time.time()
1173
-
1174
- # PERFORMANCE OPTIMIZATION: Batch file system operations with caching
1175
- # Pre-convert threshold timestamps for efficient comparison (Windows XP compatible)
1176
- threshold_start_ts = threshold_start.timestamp() if hasattr(threshold_start, 'timestamp') else time.mktime(threshold_start.timetuple())
1177
- threshold_end_ts = threshold_end.timestamp() if hasattr(threshold_end, 'timestamp') else time.mktime(threshold_end.timetuple())
1178
-
1179
- valid_files = []
1180
- try:
1181
- # Use os.listdir() with optimized timestamp comparison (XP/3.4.4 compatible)
1182
- for filename in os.listdir(local_storage_path):
1183
- if filename.endswith('.docx'):
1184
- filepath = os.path.join(local_storage_path, filename)
1185
- # Get file modification time in single operation
1186
- try:
1187
- stat_info = os.stat(filepath)
1188
- # Direct timestamp comparison avoids datetime conversion overhead
1189
- if threshold_start_ts <= stat_info.st_mtime <= threshold_end_ts:
1190
- valid_files.append(filepath)
1191
- except (OSError, ValueError):
1192
- # Skip files with invalid modification times
1193
- continue
1194
- except OSError:
1195
- MediLink_ConfigLoader.log("Error accessing directory: {}".format(local_storage_path), level="ERROR")
1196
- return
1197
-
1198
- # TIMING: End file system operations
1199
- filesystem_end_time = time.time()
1200
- filesystem_duration = filesystem_end_time - filesystem_start_time
1201
-
1202
- # PERFORMANCE OPTIMIZATION: Log file count for debugging without processing overhead
1203
- MediLink_ConfigLoader.log("Found {} DOCX files within date threshold".format(len(valid_files)), level="INFO")
1204
-
1205
- # TIMING: Start CSV data preprocessing
1206
- csv_prep_start_time = time.time()
1207
-
1208
- # PERFORMANCE OPTIMIZATION: Pre-process patient IDs for efficient lookup
1209
- # Create a set of patient IDs from CSV data for faster lookups
1210
- patient_ids_in_csv = {row.get('Patient ID', '').strip() for row in csv_data}
1211
-
1212
- # PERFORMANCE OPTIMIZATION: Pre-convert surgery dates to string format
1213
- # Convert all surgery dates to string format once to avoid repeated conversions in loops
1214
- surgery_date_strings = {}
1215
- for row in csv_data:
1216
- patient_id = row.get('Patient ID', '').strip()
1217
- surgery_date = row.get('Surgery Date')
1218
- if surgery_date != datetime.min:
1219
- surgery_date_strings[patient_id] = surgery_date.strftime("%m-%d-%Y")
1220
- else:
1221
- surgery_date_strings[patient_id] = ''
1222
-
1223
- # TIMING: End CSV data preprocessing
1224
- csv_prep_end_time = time.time()
1225
- csv_prep_duration = csv_prep_end_time - csv_prep_start_time
1226
-
1227
- # TIMING: Log before processing DOCX files
1228
- docx_processing_start_time = time.time()
1229
- print("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)))
1230
- MediLink_ConfigLoader.log("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)), level="INFO")
1231
-
1232
- # TIMING: Track individual DOCX file processing
1233
- docx_files_processed = 0
1234
- docx_files_skipped = 0
1235
- docx_parse_errors = 0
1236
-
1237
- # Process valid DOCX files
1238
- for filepath in valid_files:
1239
- # TIMING: Start individual file processing
1240
- file_start_time = time.time()
1241
-
1242
- try:
1243
- patient_data = parse_docx(filepath, surgery_dates) # Pass surgery_dates to parse_docx
1244
- docx_files_processed += 1
1245
-
1246
- # PERFORMANCE OPTIMIZATION: Use defaultdict for more efficient dictionary operations
1247
- for patient_id, service_dates in patient_data.items():
1248
- if patient_id not in all_patient_data:
1249
- all_patient_data[patient_id] = {}
1250
- for date_of_service, diagnosis_data in service_dates.items():
1251
- # TODO: SURGERY SCHEDULE CONFLICT RESOLUTION
1252
- # Implement enhanced conflict detection and logging as outlined in
1253
- # surgery_schedule_conflict_resolution_strategy.md
1254
- #
1255
- # Current behavior: Silent overwriting with latest file wins
1256
- # Proposed enhancement:
1257
- # 1. Detect when multiple files contain data for same date
1258
- # 2. Log conflicts with date-organized notifications showing:
1259
- # - Source files (with modification timestamps)
1260
- # - Patients affected (added/removed/modified)
1261
- # - Specific changes (diagnosis, laterality, etc.)
1262
- # 3. Use file modification time to determine priority
1263
- # 4. Generate summary report organized by surgery date
1264
- #
1265
- # Example notification format:
1266
- # "SURGERY SCHEDULE CONFLICTS DETECTED FOR: 12/15/2023"
1267
- # " Original: file1.docx (modified: 08:30:00)"
1268
- # " Revised: file2.docx (modified: 14:45:00)"
1269
- # " Patients affected: 3 modified, 1 added, 1 removed"
1270
- # " Resolution: Using latest file (file2.docx)"
1271
- #
1272
- # This will provide transparency when revised schedules overwrite
1273
- # original schedules, organized by the affected surgery dates.
1274
- all_patient_data[patient_id][date_of_service] = diagnosis_data
1275
- except Exception as e:
1276
- docx_parse_errors += 1
1277
- MediLink_ConfigLoader.log("Error parsing DOCX file {}: {}".format(filepath, e), level="ERROR")
1278
-
1279
- # TIMING: End individual file processing
1280
- file_end_time = time.time()
1281
- file_duration = file_end_time - file_start_time
1282
-
1283
- # Log slow files (taking more than 1 second)
1284
- if file_duration > 1.0 and PERFORMANCE_LOGGING:
1285
- print(" - Slow file: {} (Duration: {:.2f} seconds)".format(os.path.basename(filepath), file_duration))
1286
-
1287
- # TIMING: Log DOCX processing completion
1288
- docx_processing_end_time = time.time()
1289
- docx_processing_duration = docx_processing_end_time - docx_processing_start_time
1290
- if PERFORMANCE_LOGGING:
1291
- print("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1292
- time.strftime("%H:%M:%S"), docx_processing_duration))
1293
- print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(
1294
- docx_files_processed, docx_files_skipped, docx_parse_errors))
1295
- MediLink_ConfigLoader.log("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1296
- time.strftime("%H:%M:%S"), docx_processing_duration), level="INFO")
1297
-
1298
- # Log if no valid files were found
1299
- if not valid_files:
1300
- MediLink_ConfigLoader.log("No valid DOCX files found within the modification time threshold.", level="INFO")
1301
-
1302
- # Debug logging for all_patient_data
1303
- MediLink_ConfigLoader.log("All patient data collected from DOCX files: {}".format(all_patient_data), level="DEBUG")
1304
-
1305
- # Check if any patient data was collected
1306
- if not all_patient_data or not patient_ids_in_csv.intersection(all_patient_data.keys()):
1307
- MediLink_ConfigLoader.log("No patient data collected or no matching Patient IDs found. Skipping further processing.", level="INFO")
1308
- return # Exit the function early if no data is available
1309
-
1310
- # TIMING: Start CSV data matching
1311
- csv_matching_start_time = time.time()
1312
-
1313
- # Get Medisoft shorthand dictionary from crosswalk.
1314
- diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {})
1315
-
1316
- # Initialize counter for updated rows
1317
- updated_count = 0
1318
-
1319
- # PERFORMANCE OPTIMIZATION: Single pass through CSV data with pre-processed lookups
1320
- # Update the "Default Diagnosis #1" column in the CSV data and store diagnosis codes for all surgery dates
1321
- for row_num, row in enumerate(csv_data, start=1):
1322
- patient_id = row.get('Patient ID', '').strip()
1323
- # Use pre-processed patient ID lookup for efficiency
1324
- if patient_id not in patient_ids_in_csv:
1325
- continue # Skip rows that do not match any patient ID
1326
-
1327
- MediLink_ConfigLoader.log("Processing row number {}.".format(row_num), level="DEBUG")
1328
-
1329
- # Get all surgery dates for this patient
1330
- all_surgery_dates = row.get('_all_surgery_dates', [row.get('Surgery Date')])
1331
-
1332
- # Create a mapping of surgery dates to diagnosis codes for this patient
1333
- surgery_date_to_diagnosis = {}
1334
-
1335
- if patient_id in all_patient_data:
1336
- # Process each surgery date for this patient
1337
- for surgery_date in all_surgery_dates:
1338
- # Convert surgery date to string format for lookup
1339
- try:
1340
- if hasattr(surgery_date, 'strftime'):
1341
- surgery_date_str = surgery_date.strftime('%m-%d-%Y')
1342
- else:
1343
- surgery_date_str = str(surgery_date)
1344
- except Exception:
1345
- surgery_date_str = str(surgery_date)
1346
-
1347
- MediLink_ConfigLoader.log("Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1348
-
1349
- if surgery_date_str in all_patient_data[patient_id]:
1350
- diagnosis_data = all_patient_data[patient_id][surgery_date_str]
1351
- # XP SP3 + Py3.4.4 compatible tuple unpacking with safety check
1352
- try:
1353
- if isinstance(diagnosis_data, (list, tuple)) and len(diagnosis_data) >= 3:
1354
- diagnosis_code, left_or_right_eye, femto_yes_or_no = diagnosis_data
1355
- else:
1356
- # Handle case where diagnosis_data is not a proper tuple
1357
- diagnosis_code = diagnosis_data if diagnosis_data else None
1358
- left_or_right_eye = None
1359
- femto_yes_or_no = None
1360
- except Exception as e:
1361
- MediLink_ConfigLoader.log("Error unpacking diagnosis data for Patient ID: {}, Surgery Date: {}: {}".format(
1362
- patient_id, surgery_date_str, str(e)), level="WARNING")
1363
- diagnosis_code = None
1364
- left_or_right_eye = None
1365
- femto_yes_or_no = None
1366
-
1367
- MediLink_ConfigLoader.log("Found diagnosis data for Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1368
-
1369
- # Convert diagnosis code to Medisoft shorthand format.
1370
- # XP SP3 + Py3.4.4 compatible null check
1371
- if diagnosis_code is None:
1372
- medisoft_shorthand = 'N/A'
1373
- MediLink_ConfigLoader.log("Diagnosis code is None for Patient ID: {}, Surgery Date: {}".format(
1374
- patient_id, surgery_date_str), level="WARNING")
1375
- else:
1376
- medisoft_shorthand = diagnosis_to_medisoft.get(diagnosis_code, None)
1377
- if medisoft_shorthand is None and diagnosis_code:
1378
- # Use fallback logic for missing mapping (XP SP3 + Py3.4.4 compatible)
1379
- try:
1380
- defaulted_code = diagnosis_code.lstrip('H').lstrip('T8').replace('.', '')[-5:]
1381
- # Basic validation: ensure code is not empty and has reasonable length
1382
- if defaulted_code and len(defaulted_code) >= 3:
1383
- medisoft_shorthand = defaulted_code
1384
- MediLink_ConfigLoader.log("Missing diagnosis mapping for '{}', using fallback code '{}'".format(
1385
- diagnosis_code, medisoft_shorthand), level="WARNING")
1386
- else:
1387
- medisoft_shorthand = 'N/A'
1388
- MediLink_ConfigLoader.log("Fallback diagnosis code validation failed for '{}', using 'N/A'".format(
1389
- diagnosis_code), level="WARNING")
1390
- except Exception as e:
1391
- medisoft_shorthand = 'N/A'
1392
- MediLink_ConfigLoader.log("Error in fallback diagnosis code generation for '{}': {}".format(
1393
- diagnosis_code, str(e)), level="WARNING")
1394
-
1395
- MediLink_ConfigLoader.log("Converted diagnosis code to Medisoft shorthand: {}".format(medisoft_shorthand), level="DEBUG")
1396
-
1397
- surgery_date_to_diagnosis[surgery_date_str] = medisoft_shorthand
1398
- else:
1399
- MediLink_ConfigLoader.log("No matching surgery date found for Patient ID: {} on date {}.".format(patient_id, surgery_date_str), level="INFO")
1400
- surgery_date_to_diagnosis[surgery_date_str] = 'N/A'
1401
-
1402
- # Store the diagnosis mapping for all surgery dates
1403
- row['_surgery_date_to_diagnosis'] = surgery_date_to_diagnosis
1404
-
1405
- # Set the primary diagnosis code (for the main surgery date)
1406
- primary_surgery_date = row.get('Surgery Date')
1407
- # Convert primary surgery date to string for lookup
1408
- if isinstance(primary_surgery_date, datetime):
1409
- primary_surgery_date_str = primary_surgery_date.strftime('%m-%d-%Y')
1410
- else:
1411
- primary_surgery_date_str = str(primary_surgery_date)
1412
- primary_diagnosis = surgery_date_to_diagnosis.get(primary_surgery_date_str, 'N/A')
1413
- row['Default Diagnosis #1'] = primary_diagnosis
1414
-
1415
- updated_count += 1
1416
- MediLink_ConfigLoader.log("Updated row number {} with diagnosis codes for {} surgery dates.".format(row_num, len(all_surgery_dates)), level="INFO")
1417
- else:
1418
- MediLink_ConfigLoader.log("Patient ID: {} not found in DOCX data for row {}.".format(patient_id, row_num), level="INFO")
1419
-
1420
- # TIMING: End CSV data matching
1421
- csv_matching_end_time = time.time()
1422
- csv_matching_duration = csv_matching_end_time - csv_matching_start_time
1423
-
1424
- # Log total count of updated rows
1425
- MediLink_ConfigLoader.log("Total {} 'Default Diagnosis #1' rows updated.".format(updated_count), level="INFO")
1426
-
1427
- # TIMING: End surgery schedule parsing timing
1428
- parsing_end_time = time.time()
1429
- parsing_duration = parsing_end_time - parsing_start_time
1430
- if PERFORMANCE_LOGGING:
1431
- print("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1432
- time.strftime("%H:%M:%S"), parsing_duration))
1433
- print(" - File system operations: {:.2f} seconds ({:.1f}%)".format(filesystem_duration, (filesystem_duration/parsing_duration)*100))
1434
- print(" - CSV data preprocessing: {:.2f} seconds ({:.1f}%)".format(csv_prep_duration, (csv_prep_duration/parsing_duration)*100))
1435
- print(" - DOCX file processing: {:.2f} seconds ({:.1f}%)".format(docx_processing_duration, (docx_processing_duration/parsing_duration)*100))
1436
- print(" - CSV data matching: {:.2f} seconds ({:.1f}%)".format(csv_matching_duration, (csv_matching_duration/parsing_duration)*100))
1437
- print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(docx_files_processed, docx_files_skipped, docx_parse_errors))
1438
- MediLink_ConfigLoader.log("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1439
- time.strftime("%H:%M:%S"), parsing_duration), level="INFO")
1440
-
1441
- except Exception as e:
1442
- message = "An error occurred while updating diagnosis codes. Please check the DOCX files and configuration: {}".format(e)
1443
- MediLink_ConfigLoader.log(message, level="ERROR")
1444
- print(message)
1445
-
1446
def load_data_sources(config, crosswalk):
    """
    Load the two historical lookups: MAPAT first, then Carol's CSVs.

    Args:
        config (dict): Configuration handed to both loaders.
        crosswalk (dict): Crosswalk data handed to the MAPAT loader.

    Returns:
        tuple: (patient_id_to_insurance_id, payer_id_to_patient_ids).

    Raises:
        ValueError: If either loader returns an empty (falsy) result.
    """
    mapat_lookup = load_insurance_data_from_mapat(config, crosswalk)
    if not mapat_lookup:
        raise ValueError("Failed to load historical Patient ID to Insurance ID mappings from MAPAT.")

    # Only reached when the MAPAT lookup produced at least one entry.
    carol_lookup = load_historical_payer_to_patient_mappings(config)
    if not carol_lookup:
        raise ValueError("Failed to load historical Carol's CSVs.")

    return mapat_lookup, carol_lookup
1457
-
1458
def map_payer_ids_to_insurance_ids(patient_id_to_insurance_id, payer_id_to_patient_ids):
    """
    Build per-payer crosswalk details from the historical mappings.

    Args:
        patient_id_to_insurance_id (dict): Patient ID -> Medisoft insurance ID.
        payer_id_to_patient_ids (dict): Payer ID -> iterable of Patient IDs.

    Returns:
        dict: Payer ID -> {"endpoint", "medisoft_id", "medisoft_medicare_id"}.
        Payers for which no patient resolved to an insurance ID are omitted.
    """
    details_by_payer = {}
    for payer_id, patient_ids in payer_id_to_patient_ids.items():
        resolved_ids = set()
        for patient_id in patient_ids:
            if patient_id not in patient_id_to_insurance_id:
                MediLink_ConfigLoader.log("No matching Insurance ID found for Patient ID {}".format(patient_id))
                continue
            medisoft_id = patient_id_to_insurance_id[patient_id]
            resolved_ids.add(medisoft_id)
            MediLink_ConfigLoader.log("Added Medisoft ID {} for Patient ID {} and Payer ID {}".format(medisoft_id, patient_id, payer_id))

        if not resolved_ids:
            continue  # nothing to record for this payer

        details_by_payer[payer_id] = {
            "endpoint": "OPTUMEDI",  # TODO Default, to be refined via API poll. There are 2 of these defaults!
            "medisoft_id": list(resolved_ids),
            "medisoft_medicare_id": []  # Placeholder for future implementation
        }
    return details_by_payer
1477
-
1478
def _display_mains_file_error(mains_path):
    """
    Report a missing MAINS file: log it as CRITICAL, print a banner, then pause.

    Args:
        mains_path (str): The path where the MAINS file was expected to be found.
    """
    error_msg = "CRITICAL: MAINS file not found at: {}. This file is required for insurance name to Medisoft ID mapping.".format(mains_path)
    if hasattr(MediLink_ConfigLoader, 'log'):
        MediLink_ConfigLoader.log(error_msg, level="CRITICAL")

    rule = "="*80
    banner_lines = (
        "\n" + rule,
        "CRITICAL ERROR: MAINS FILE MISSING",
        rule,
        "\nThe MAINS file is required for the following critical functions:",
        "* Mapping insurance company names to Medisoft IDs",
        "* Converting insurance names to payer IDs for claim submission",
        "* Creating properly formatted 837p claim files",
        "\nWithout this file, claim submission will fail because:",
        "* Insurance names cannot be converted to payer IDs",
        "* 837p claim files cannot be generated",
        "* Claims cannot be submitted to insurance companies",
        "\nTO FIX THIS:",
        "1. Ensure the MAINS file exists at: {}".format(mains_path),
        "2. If the file is missing, llamar a Dani",
        "3. The file should contain insurance company data from your Medisoft system",
        rule,
    )
    for banner_line in banner_lines:
        print(banner_line)

    time.sleep(3)  # 3 second pause to allow user to read critical error message
1505
-
1506
-
1507
def load_insurance_data_from_mains(config):
    """
    Build an insurance-name -> insurance-ID mapping from the MAINS file.

    The ID stored for each name is the line number yielded by
    MediLink_DataMgmt.read_general_fixed_width_data for that record.

    Note: the `config` argument is replaced by the cached configuration on the
    first statement, so the value passed in is not used.

    Args:
        config (dict): Configuration object (overwritten by the cached one).

    Returns:
        dict: Insurance names mapped to insurance IDs. Empty when the
        configuration, the data-management module or the MAINS file is
        unavailable; may be partial if reading fails midway.
    """
    # Use cached configuration to avoid repeated loading
    try:
        config, crosswalk = get_cached_configuration()
    except Exception as cache_error:
        print("Warning: Failed to load cached configuration: {}".format(cache_error))
        return {}

    # XP Compatibility: the data-management module may not have imported
    if MediLink_DataMgmt is None:
        print("Warning: MediLink_DataMgmt not available. Cannot load MAINS data.")
        return {}

    # TODO (Low) For secondary insurance, this needs to be pulling from the correct MAINS (there are 2)
    # TODO (Low) Performance: There probably needs to be a dictionary proxy for MAINS that gets updated.
    # Meh, this just has to be part of the new architecture plan where we make Medisoft a downstream
    # recipient from the db.
    # TODO (High) The Medisoft Medicare flag needs to be brought in here.
    try:
        mains_path = config.get('MAINS_MED_PATH', '')
        mains_slices = crosswalk.get('mains_mapping', {}).get('slices', {})
    except (KeyError, AttributeError) as lookup_error:
        print("Warning: Failed to get MAINS configuration: {}".format(lookup_error))
        return {}

    name_to_id = {}

    try:
        if not os.path.exists(mains_path):
            _display_mains_file_error(mains_path)
            return name_to_id

        # XP Compatibility: make sure the fixed-width reader is present
        if not hasattr(MediLink_DataMgmt, 'read_general_fixed_width_data'):
            print("Warning: MediLink_DataMgmt.read_general_fixed_width_data not available. Cannot load MAINS data.")
            return name_to_id

        mains_records = MediLink_DataMgmt.read_general_fixed_width_data(mains_path, mains_slices)
        for record, line_number in mains_records:
            # Assuming line_number gives the correct insurance ID without needing adjustment
            name_to_id[record['MAINSNAME']] = line_number

        success_msg = "Successfully loaded {} insurance records from MAINS".format(len(name_to_id))
        if hasattr(MediLink_ConfigLoader, 'log'):
            MediLink_ConfigLoader.log(success_msg, level="INFO")
        else:
            print(success_msg)

    except FileNotFoundError:
        _display_mains_file_error(mains_path)
    except Exception as read_error:
        error_msg = "Error loading MAINS data: {}. Continuing without MAINS data.".format(str(read_error))
        if hasattr(MediLink_ConfigLoader, 'log'):
            MediLink_ConfigLoader.log(error_msg, level="ERROR")
        print(error_msg)

    return name_to_id
1578
-
1579
def load_insurance_data_from_mapat(config, crosswalk):
    """
    Build a patient-ID -> insurance-ID mapping from the MAPAT file.

    Args:
        config (dict): Configuration object (not read by this function).
        crosswalk (dict): Must provide crosswalk['mapat_mapping']['slices'],
            the slice definitions passed to the fixed-width reader.

    Returns:
        dict: A dictionary mapping patient IDs to insurance IDs.
    """
    # The MAPAT path comes from the app-control object; fall back to '' without one
    controller = _ac()
    if controller:
        mapat_path = controller.get_mapat_med_path()
    else:
        mapat_path = ''
    mapat_slices = crosswalk['mapat_mapping']['slices']

    insurance_by_patient = {}
    mapat_records = MediLink_DataMgmt.read_general_fixed_width_data(mapat_path, mapat_slices)
    for record, _unused in mapat_records:
        patient_id = record['MAPATPXID']
        insurance_id = record['MAPATINID']
        insurance_by_patient[patient_id] = insurance_id

    return insurance_by_patient
1605
-
1606
def parse_z_dat(z_dat_path, config):  # Why is this in MediBot and not MediLink?
    """
    Map Patient IDs to Insurance Names from the fixed-width Z.dat file.

    Each record is read as five lines and parsed by
    MediLink_DataMgmt.parse_fixed_width_data; a mapping is stored only when
    both 'PATID' and 'INAME' come back non-empty. Read or parse failures are
    logged and whatever was collected so far is returned.

    Args:
        z_dat_path (str): Path to the Z.dat file.
        config (dict): Configuration; its 'MediLink_Config' entry (or the whole
            dict when that key is absent) is passed to the parser.

    Returns:
        dict: A dictionary mapping Patient IDs to Insurance Names.
    """
    names_by_patient = {}

    try:
        # Z.dat reserved record format: 3 active + 2 reserved lines per record
        for line_1, line_2, line_3, line_4, line_5 in MediLink_DataMgmt.read_fixed_width_data(z_dat_path):
            parsed = MediLink_DataMgmt.parse_fixed_width_data(
                line_1, line_2, line_3, line_4, line_5, config.get('MediLink_Config', config))

            patient_id = parsed.get('PATID')
            insurance_name = parsed.get('INAME')
            if not (patient_id and insurance_name):
                continue

            names_by_patient[patient_id] = insurance_name
            MediLink_ConfigLoader.log("Mapped Patient ID {} to Insurance Name {}".format(patient_id, insurance_name), config, level="INFO")

    except FileNotFoundError:
        MediLink_ConfigLoader.log("File not found: {}".format(z_dat_path), config, level="INFO")
    except Exception as parse_error:
        MediLink_ConfigLoader.log("Failed to parse Z.dat: {}".format(str(parse_error)), config, level="INFO")

    return names_by_patient
1639
-
1640
def load_historical_payer_to_patient_mappings(config):
    """
    Load historical Payer ID -> Patient ID mappings from Carol's CSV files.

    Every '*.csv' file in the directory that contains config['CSV_FILE_PATH']
    is scanned. Rows need non-empty 'Patient ID' and 'Ins1 Payer ID' values;
    all other rows are skipped. A file that cannot be processed is reported and
    logged, and scanning continues with the next file.

    Args:
        config (dict): Configuration object; 'CSV_FILE_PATH' locates the directory.

    Returns:
        dict: Each key is a Payer ID and the value is a set of Patient IDs.
        Empty when the directory is missing or no valid rows were found.
    """
    directory_path = os.path.dirname(config['CSV_FILE_PATH'])
    # A plain dict with setdefault() avoids relying on collections.defaultdict,
    # which the module's visible imports do not provide.
    payer_to_patient_ids = {}

    try:
        if not os.path.isdir(directory_path):
            raise FileNotFoundError("Directory '{}' not found.".format(directory_path))

        # Loop through each file in the directory containing Carol's historical CSVs
        for filename in os.listdir(directory_path):
            if not filename.endswith('.csv'):
                continue
            file_path = os.path.join(directory_path, filename)
            try:
                # newline='' is what the csv module expects for correct handling
                # of line endings embedded in quoted fields.
                with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
                    patient_count = 0  # valid mappings found in this CSV
                    for row in csv.DictReader(csvfile):
                        # DictReader fills cells missing from short rows with
                        # None; treat None, absent and blank alike.
                        payer_id = (row.get('Ins1 Payer ID') or '').strip()
                        patient_id = (row.get('Patient ID') or '').strip()
                        if not payer_id or not patient_id:
                            continue
                        payer_to_patient_ids.setdefault(payer_id, set()).add(patient_id)
                        patient_count += 1

                if patient_count > 0:
                    MediLink_ConfigLoader.log("CSV file '{}' has {} Patient IDs with Payer IDs.".format(filename, patient_count), level="DEBUG")
                else:
                    MediLink_ConfigLoader.log("CSV file '{}' is empty or does not have valid Patient ID or Payer ID mappings.".format(filename), level="DEBUG")
            except Exception as e:
                # Best effort: one unreadable file must not stop the scan.
                print("Error processing file {}: {}".format(filename, e))
                MediLink_ConfigLoader.log("Error processing file '{}': {}".format(filename, e), level="ERROR")
    except FileNotFoundError as e:
        print("Error: {}".format(e))

    if not payer_to_patient_ids:
        print("No historical mappings were generated.")

    return payer_to_patient_ids
1694
-
1695
- def capitalize_all_fields(csv_data):
1696
- """
1697
- Converts all text fields in the CSV data to uppercase.
1698
-
1699
- Parameters:
1700
- csv_data (list of dict): The CSV data where each row is represented as a dictionary.
1701
-
1702
- Returns:
1703
- None: The function modifies the csv_data in place.
1704
- """
1705
- # PERFORMANCE FIX: Optimize uppercase conversion while preserving complex types
1706
- for row in csv_data:
1707
- updated_row = {}
1708
- for key, value in row.items():
1709
- # Preserve internal/derived fields intact (e.g., `_all_surgery_dates`, `_surgery_date_to_diagnosis`)
1710
- if isinstance(key, str) and key.startswith('_'):
1711
- updated_row[key] = value
1712
- continue
1713
- # Uppercase plain strings
1714
- if isinstance(value, str):
1715
- updated_row[key] = value.upper()
1716
- continue
1717
- # Preserve complex containers; optionally uppercase their string contents
1718
- if isinstance(value, list):
1719
- updated_row[key] = [elem.upper() if isinstance(elem, str) else elem for elem in value]
1720
- continue
1721
- if isinstance(value, dict):
1722
- updated_row[key] = {k: (v.upper() if isinstance(v, str) else v) for k, v in value.items()}
1723
- continue
1724
- # Leave datetimes as-is; coerce simple scalars to string upper for consistency
1725
- if isinstance(value, datetime):
1726
- updated_row[key] = value
1727
- else:
1728
- updated_row[key] = str(value).upper() if value is not None else value
1
+ # MediBot_Preprocessor_lib.py
2
+ """
3
+ Core preprocessing library for MediBot
4
+ Contains core preprocessing functions and utilities.
5
+ """
6
+
7
+ import csv, time, os, sys
8
+ from datetime import datetime, timedelta
9
+ from collections import OrderedDict
10
+
11
+ # Try to import chardet for encoding detection
12
+ try:
13
+ import chardet
14
+ except ImportError:
15
+ chardet = None # Fallback if chardet is not available
16
+
17
+ # SORTING STRATEGY CONFIGURATION
18
+ # Set to 'schedule_based' to enable surgery schedule sorting
19
+ # Set to 'date_based' to use current date-based sorting (default)
20
+ SORTING_STRATEGY = 'date_based' # Hard-coded with clear comments
21
+
22
+ # When enabled, patients will be sorted based on their position in the DOCX surgery schedule
23
+ # When disabled, patients will be sorted by earliest surgery date (current behavior)
24
+
25
+ # Use core utilities for standardized imports
26
+ from MediCafe.core_utils import (
27
+ import_medibot_module,
28
+ import_medilink_module,
29
+ get_config_loader_with_fallback
30
+ )
31
+
32
+ # Initialize configuration loader with fallback
33
+ MediLink_ConfigLoader = get_config_loader_with_fallback()
34
+
35
+ # Import MediLink_DataMgmt using centralized import function
36
+ MediLink_DataMgmt = import_medilink_module('MediLink_DataMgmt')
37
+
38
+ # Import MediBot modules using centralized import functions
39
MediBot_UI = import_medibot_module('MediBot_UI')
if MediBot_UI:
    app_control = getattr(MediBot_UI, 'app_control', None)
    get_app_control = getattr(MediBot_UI, '_get_app_control', None)
else:
    app_control = None
    get_app_control = None


def _ac():
    """
    Return the MediBot_UI app-control object, or None when it is unavailable.

    Defined unconditionally so callers can always write `ac = _ac()` and test
    the result, even when the MediBot_UI module could not be imported.
    Prefers MediBot_UI._get_app_control() and falls back to the module's
    `app_control` attribute when that accessor is missing or raises.
    """
    if not MediBot_UI:
        return None
    try:
        return get_app_control() if get_app_control else getattr(MediBot_UI, 'app_control', None)
    except Exception:
        return getattr(MediBot_UI, 'app_control', None)
50
+
51
+ MediBot_docx_decoder = import_medibot_module('MediBot_docx_decoder')
52
+ if MediBot_docx_decoder:
53
+ parse_docx = getattr(MediBot_docx_decoder, 'parse_docx', None)
54
+ else:
55
+ parse_docx = None
56
+
57
+ # Add the parent directory of the project to the Python path
58
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
59
+
60
+ # Configuration cache to avoid repeated loading
61
+ _config_cache = None
62
+ _crosswalk_cache = None
63
+
64
+ # Use core utilities for standardized imports
65
+ from MediCafe.core_utils import get_shared_config_loader
66
+ MediLink_ConfigLoader = get_shared_config_loader()
67
+
68
+ # Ensure MediLink_ConfigLoader is available
69
if MediLink_ConfigLoader is None:
    print("Warning: MediLink_ConfigLoader not available. Some functionality may be limited.")

    class FallbackLogger:
        """Minimal stand-in that prints log messages when no config loader exists."""

        def log(self, message, config=None, level="INFO", **kwargs):
            """
            Print `message` prefixed with its level.

            `config` and any extra keyword arguments are accepted and ignored so
            that call sites written as log(msg, config, level=...) keep working
            against this fallback.
            """
            # Earlier signature was log(message, level); honor a level passed
            # as the second positional argument.
            if isinstance(config, str) and level == "INFO":
                level = config
            print("[{}] {}".format(level, message))

    MediLink_ConfigLoader = FallbackLogger()
76
+
77
+ # Import centralized logging configuration
78
+ try:
79
+ from MediCafe.logging_config import PERFORMANCE_LOGGING
80
+ except ImportError:
81
+ # Fallback to local flag if centralized config is not available
82
+ PERFORMANCE_LOGGING = False
83
+
84
+ # XP Compatibility: Add robust fallback for configuration loading
85
def get_cached_configuration_xp_safe():
    """
    Return (config, crosswalk), loading and caching them on first use.

    Resolution order:
      1. The module-level cache, when both entries are already set.
      2. MediLink_ConfigLoader.load_configuration(), when available.
      3. json/config.json and json/crosswalk.json under the directory one
         level above this file; a missing file yields an empty dict.
      4. Two empty dicts when the direct load raises.

    Returns:
        tuple: (config, crosswalk).
    """
    global _config_cache, _crosswalk_cache

    cache_is_warm = _config_cache is not None and _crosswalk_cache is not None
    if cache_is_warm:
        return _config_cache, _crosswalk_cache

    # Preferred path: the shared configuration loader
    try:
        if MediLink_ConfigLoader and hasattr(MediLink_ConfigLoader, 'load_configuration'):
            _config_cache, _crosswalk_cache = MediLink_ConfigLoader.load_configuration()
            return _config_cache, _crosswalk_cache
    except Exception as loader_error:
        print("Warning: Failed to load configuration via MediLink_ConfigLoader: {}".format(loader_error))

    # Fallback path: read the JSON files directly
    try:
        import json

        def _json_or_empty(json_path):
            # A missing file is not an error here; it just means "no settings".
            if not os.path.exists(json_path):
                return {}
            with open(json_path, 'r') as handle:
                return json.load(handle)

        project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        _config_cache = _json_or_empty(os.path.join(project_dir, 'json', 'config.json'))
        _crosswalk_cache = _json_or_empty(os.path.join(project_dir, 'json', 'crosswalk.json'))
        return _config_cache, _crosswalk_cache

    except Exception as file_error:
        print("Warning: Failed to load configuration files directly: {}".format(file_error))
        # Return (and cache) empty defaults
        _config_cache = {}
        _crosswalk_cache = {}
        return _config_cache, _crosswalk_cache
132
+
133
class InitializationError(Exception):
    """Error raised when module initialization from the config fails."""

    def __init__(self, message):
        super().__init__(message)
        # Kept as an attribute so handlers can read the text directly.
        self.message = message
137
+
138
def initialize(config):
    """
    Publish core settings from `config` as module-level globals.

    Sets AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping (wrapped in an
    OrderedDict) and page_end_markers, using an empty default for any key the
    config does not contain.

    Args:
        config: Mapping-like object exposing .get(key, default).

    Raises:
        InitializationError: If reading a setting raises AttributeError
            (for example when `config` has no .get method).
    """
    global AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping, page_end_markers

    fallback_values = {
        'AHK_EXECUTABLE': "",
        'CSV_FILE_PATH': "",
        'field_mapping': {},
        'page_end_markers': []
    }

    module_namespace = globals()
    for setting_name, fallback in fallback_values.items():
        try:
            setting_value = config.get(setting_name, fallback)
            if setting_name == 'field_mapping':
                setting_value = OrderedDict(setting_value)
            module_namespace[setting_name] = setting_value
        except AttributeError:
            raise InitializationError("Error: '{}' not found in config.".format(setting_name))
153
+
154
def get_cached_configuration():
    """
    Return the cached (config, crosswalk) pair.

    Thin alias for get_cached_configuration_xp_safe(), which performs the
    loading and caching.
    """
    config_and_crosswalk = get_cached_configuration_xp_safe()
    return config_and_crosswalk
159
+
160
def open_csv_for_editing(csv_file_path):
    """
    Open the CSV with its associated application via the shell `start` command.

    Prints a reminder to re-run MediBot afterwards, or a failure message if
    launching raises.

    Args:
        csv_file_path (str): Path of the CSV file to open.
    """
    try:
        launch_command = 'start "" "{}"'.format(csv_file_path)
        os.system(launch_command)
        print("After saving the revised CSV, please re-run MediBot.")
    except Exception as launch_error:
        print("Failed to open CSV file:", launch_error)
167
+
168
+ # Function to clean the headers
169
def clean_header(headers):
    """
    Normalize CSV header names.

    Each header is stripped of surrounding whitespace, then every character
    that is not alphanumeric, whitespace, '-' or '_' is dropped.

    Parameters:
    headers (list of str): The original header strings.

    Returns:
    list of str: The cleaned header strings, in the same order.

    Raises:
    ValueError: If 'Surgery Date' is not among the cleaned headers.
    """
    def _is_allowed(char):
        return char.isalnum() or char.isspace() or char in ('-', '_')

    cleaned_headers = [
        ''.join(filter(_is_allowed, raw_header.strip()))
        for raw_header in headers
    ]

    # Log the original and cleaned headers for debugging
    MediLink_ConfigLoader.log("Original headers: {}".format(headers), level="INFO")
    MediLink_ConfigLoader.log("Cleaned headers: {}".format(cleaned_headers), level="INFO")

    if 'Surgery Date' in cleaned_headers:
        return cleaned_headers

    # The required column is missing: report it everywhere, then fail.
    MediLink_ConfigLoader.log("WARNING: 'Surgery Date' header not found after cleaning.", level="WARNING")
    print("WARNING: 'Surgery Date' header not found after cleaning.")
    raise ValueError("Error: 'Surgery Date' header not found after cleaning.")
199
+
200
+ # Function to load and process CSV data
201
def load_csv_data(csv_file_path):
    """
    Load the CSV file into a list of row dictionaries keyed by cleaned headers.

    The encoding is detected with chardet when it is available, otherwise
    UTF-8 is assumed. Headers are normalized with clean_header(), which raises
    ValueError when 'Surgery Date' is missing; an empty file takes that same
    path.

    Parameters:
    csv_file_path (str): Path of the CSV file to read.

    Returns:
    list of dict: One dictionary per data row.

    Exits the process with status 1 (after printing a hint) when the file is
    missing or cannot be read.
    """
    try:
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError("***Error: CSV file '{}' not found.".format(csv_file_path))

        # Detect the file encoding; the raw bytes are only needed by chardet.
        if chardet:
            with open(csv_file_path, 'rb') as f:
                result = chardet.detect(f.read())
            encoding = result['encoding']
            confidence = result['confidence']
        else:
            # Fallback to UTF-8 when chardet is not available
            encoding = 'utf-8'
            confidence = 1.0
        print("Detected encoding: {} (Confidence: {:.2f})".format(encoding, confidence))

        # newline='' lets the csv module handle line endings itself, including
        # ones embedded in quoted fields.
        with open(csv_file_path, 'r', encoding=encoding, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            # fieldnames is None for an empty file; pass [] so clean_header
            # reports the missing 'Surgery Date' header instead of a TypeError.
            original_headers = reader.fieldnames or []
            cleaned_headers = clean_header(original_headers)

            # Map each cleaned header back to the original column name
            header_mapping = dict(zip(cleaned_headers, original_headers))

            csv_data = [
                {clean: row[header_mapping[clean]] for clean in cleaned_headers}
                for row in reader
            ]

        return csv_data  # Return a list of dictionaries
    except FileNotFoundError as e:
        print(e)  # Print the informative error message
        print("Hint: Check if CSV file is located in the expected directory or specify a different path in config file.")
        print("Please correct the issue and re-run MediBot.")
        sys.exit(1)  # Halt the script
    except IOError as e:
        print("Error reading CSV file: {}. Please check the file path and permissions.".format(e))
        sys.exit(1)  # Halt the script in case of other IO errors
248
+
249
+ # CSV Pre-processor Helper functions
250
def add_columns(csv_data, column_headers):
    """
    Add one or more columns, set to the empty string, to every row.

    An existing column of the same name is overwritten with ''.

    Parameters:
    csv_data (list of dict): The CSV data where each row is represented as a dictionary.
    column_headers (list of str or str): Header(s) of the column(s) to add.

    Returns:
    None: The function modifies the csv_data in place.

    Raises:
    ValueError: If column_headers is neither a list nor a string.
    """
    if isinstance(column_headers, str):
        header_list = [column_headers]
    elif isinstance(column_headers, list):
        header_list = column_headers
    else:
        raise ValueError("column_headers should be a list or a string")

    for record in csv_data:
        # One update per row sets all new columns together
        record.update(dict.fromkeys(header_list, ''))
270
+
271
+ # Extracting the list to a variable for future refactoring:
272
def filter_rows(csv_data):
    """
    Drop, in place, rows without a Patient ID or with an excluded primary insurance.

    Parameters:
    csv_data (list of dict): The CSV rows; the list object itself is updated.
    """
    # TODO: This should be written in the crosswalk and not hardcoded here.
    excluded_insurance = {'AETNA', 'AETNA MEDICARE', 'HUMANA MED HMO'}

    def _should_keep(row):
        if not row.get('Patient ID'):
            return False
        return row.get('Primary Insurance') not in excluded_insurance

    # Slice assignment keeps the caller's list object and replaces its contents
    csv_data[:] = list(filter(_should_keep, csv_data))
276
+
277
def detect_date_format(date_str):
    """
    Guess the strptime format of a date string from its shape.

    Only the first whitespace-separated token is inspected. The token must
    contain exactly two '-' or two '/' separators ('-' is checked first). The
    position and width of the year component decide the format:

      - 4-digit first component -> '%Y-%m-%d' / '%Y/%m/%d'
      - 4-digit last component  -> '%m-%d-%Y' / '%m/%d/%Y'
      - 2-digit last component  -> '%m-%d-%y' / '%m/%d/%y'

    The result is a hint only: callers still have to attempt the parse.

    Parameters:
    - date_str (str): The date string to analyze

    Returns:
    - str: The most likely format string, or None if unclear
    """
    if not date_str:
        return None

    tokens = date_str.split()
    if not tokens:
        # Whitespace-only input has no date token to inspect.
        return None
    date_only = tokens[0]  # drop any time component

    for separator in ('-', '/'):
        if date_only.count(separator) != 2:
            continue
        first, _middle, last = date_only.split(separator)
        if len(first) == 4 and first.isdigit():
            return '%Y{0}%m{0}%d'.format(separator)
        if len(last) == 4 and last.isdigit():
            return '%m{0}%d{0}%Y'.format(separator)
        if len(last) == 2 and last.isdigit():
            return '%m{0}%d{0}%y'.format(separator)

    return None
319
+
320
class OptimizedDate:
    """
    Date wrapper that pre-computes the common string renderings of a datetime.

    Instances compare (==, <, <=, >, >=) with other OptimizedDate objects and
    with any object exposing `strftime` (e.g. datetime), by delegating to the
    wrapped datetime. They hash like the wrapped datetime, so they can be used
    as dict keys and set members.
    """

    # Formats tried by from_string(), most common first.
    _PARSE_FORMATS = ('%m/%d/%Y', '%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%Y/%m/%d', '%Y-%m-%d')

    def __init__(self, datetime_obj):
        self.datetime = datetime_obj
        # Pre-compute all common format variations
        self._display_short = datetime_obj.strftime('%m-%d')      # For table display
        self._display_full = datetime_obj.strftime('%m-%d-%Y')    # Full format
        self._medisoft_format = datetime_obj.strftime('%m%d%Y')   # For Medisoft entry
        self._iso_format = datetime_obj.strftime('%Y-%m-%d')      # For sorting/comparison

    @property
    def display_short(self):
        """Short display format: MM-DD"""
        return self._display_short

    @property
    def display_full(self):
        """Full display format: MM-DD-YYYY"""
        return self._display_full

    @property
    def medisoft_format(self):
        """Medisoft entry format: MMDDYYYY"""
        return self._medisoft_format

    @property
    def iso_format(self):
        """ISO format for sorting: YYYY-MM-DD"""
        return self._iso_format

    def __str__(self):
        return self._display_full

    def __repr__(self):
        return "OptimizedDate({})".format(self._display_full)

    @staticmethod
    def _comparable(other):
        """Return the datetime-like value to compare against, or NotImplemented."""
        if isinstance(other, OptimizedDate):
            return other.datetime
        if hasattr(other, 'strftime'):  # datetime object
            return other
        return NotImplemented

    def __eq__(self, other):
        other_value = self._comparable(other)
        if other_value is NotImplemented:
            return False
        return self.datetime == other_value

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hashing the
        # wrapped datetime keeps equal objects (including plain datetimes)
        # in the same hash bucket.
        return hash(self.datetime)

    def __lt__(self, other):
        other_value = self._comparable(other)
        if other_value is NotImplemented:
            return NotImplemented
        return self.datetime < other_value

    def __le__(self, other):
        other_value = self._comparable(other)
        if other_value is NotImplemented:
            return NotImplemented
        return self.datetime <= other_value

    def __gt__(self, other):
        other_value = self._comparable(other)
        if other_value is NotImplemented:
            return NotImplemented
        return self.datetime > other_value

    def __ge__(self, other):
        other_value = self._comparable(other)
        if other_value is NotImplemented:
            return NotImplemented
        return self.datetime >= other_value

    def strftime(self, format_str):
        """Fallback for any custom format needs"""
        return self.datetime.strftime(format_str)

    @classmethod
    def from_string(cls, date_str, cleaned=False):
        """
        Create OptimizedDate from string, with optional pre-cleaning.

        Args:
            date_str: Date string to parse
            cleaned: If True, assumes string is already cleaned; otherwise it
                is first passed through clean_surgery_date_string()

        Returns:
            OptimizedDate object or None if the string is empty or parsing fails
        """
        if not cleaned:
            date_str = clean_surgery_date_string(date_str)
        if not date_str:
            return None

        for fmt in cls._PARSE_FORMATS:
            try:
                return cls(datetime.strptime(date_str, fmt))
            except ValueError:
                continue

        return None
416
+
417
def clean_surgery_date_string(date_str):
    """
    Cleans and normalizes surgery date strings to handle damaged data.

    Parameters:
    - date_str (str): The raw date string from the CSV

    Returns:
    - str: Cleaned date string in MM/DD/YYYY format, or empty string if unparseable
    """
    if not date_str:
        return ''

    # Coerce to text and collapse every whitespace run (spaces, newlines,
    # carriage returns, tabs) into a single space; this also strips the ends.
    date_str = ' '.join(str(date_str).split())
    if not date_str:
        return ''

    # Try the format suggested by the detector first, when it offers one.
    detected_format = detect_date_format(date_str)
    if detected_format:
        try:
            return datetime.strptime(date_str, detected_format).strftime('%m/%d/%Y')
        except ValueError:
            pass

    # Known formats, ordered from most to least common so that typical input
    # succeeds on the first or second attempt.
    known_formats = (
        '%m/%d/%Y',
        '%m-%d-%Y',
        '%m/%d/%y',
        '%m-%d-%y',
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%m/%d/%Y %H:%M:%S',
        '%m-%d-%Y %H:%M:%S',
    )
    for fmt in known_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime('%m/%d/%Y')
        except ValueError:
            continue

    # Last resort: pull numeric components out of the first whitespace-separated
    # token and interpret them as month, day, year.
    import re
    numbers = re.findall(r'\d+', date_str.split()[0])
    if len(numbers) >= 3:
        month, day, year = int(numbers[0]), int(numbers[1]), int(numbers[2])

        # Expand 2-digit years BEFORE range validation. Validating first made
        # this expansion unreachable, so every 2-digit-year date was rejected.
        if year < 100:
            year += 2000 if year < 50 else 1900

        if 1 <= month <= 12 and 1 <= day <= 31 and 1900 <= year <= 2100:
            try:
                return datetime(year, month, day).strftime('%m/%d/%Y')
            except ValueError:
                # Impossible calendar date such as 02/30.
                pass

    # If all parsing attempts fail, return empty string
    return ''
528
+
529
def convert_surgery_date(csv_data):
    """
    Converts surgery date strings to datetime objects with comprehensive data cleaning.

    Each row's 'Surgery Date' is replaced in place. Empty, uncleanable or
    unparseable values become datetime.min. Only the first five errors are
    reported; a timing/count summary is logged at the end.

    Parameters:
    - csv_data (list): List of dictionaries containing CSV row data

    Returns:
    - None: The rows are modified in place.
    """
    total_start_time = time.time()
    date_cleaning_time = 0
    date_parsing_time = 0
    processed_count = 0
    empty_count = 0
    error_count = 0

    print("Starting surgery date conversion for {} rows...".format(len(csv_data)))

    # clean_surgery_date_string always emits this layout on success.
    standard_format = '%m/%d/%Y'

    for row in csv_data:
        surgery_date_str = row.get('Surgery Date', '')

        if not surgery_date_str:
            # Routine empty dates are counted but deliberately not logged.
            empty_count += 1
            row['Surgery Date'] = datetime.min
            continue

        cleaning_start = time.time()
        cleaned_date_str = clean_surgery_date_string(surgery_date_str)
        date_cleaning_time += (time.time() - cleaning_start)

        if not cleaned_date_str:
            error_count += 1
            if error_count <= 5:  # Only report the first 5 errors
                MediLink_ConfigLoader.log("Error: Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row), level="INFO")
                print("Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row))
            row['Surgery Date'] = datetime.min
            continue

        parsing_start = time.time()
        try:
            row['Surgery Date'] = datetime.strptime(cleaned_date_str, standard_format)
            processed_count += 1
        except ValueError as e:
            error_count += 1
            if error_count <= 5:  # Only report the first 5 errors
                MediLink_ConfigLoader.log("Error parsing cleaned Surgery Date '{}': {} for row: {}".format(
                    cleaned_date_str, e, row), level="INFO")
            row['Surgery Date'] = datetime.min
        date_parsing_time += (time.time() - parsing_start)

    total_duration = time.time() - total_start_time

    def _share(part):
        # The total can be exactly 0.0 for empty input or on a coarse clock;
        # report 0% rather than raising ZeroDivisionError.
        return (part / total_duration) * 100 if total_duration else 0.0

    if PERFORMANCE_LOGGING:
        print("Surgery date conversion completed:")
        print(" - Total duration: {:.2f} seconds".format(total_duration))
        print(" - Date cleaning time: {:.2f} seconds ({:.1f}%)".format(date_cleaning_time, _share(date_cleaning_time)))
        print(" - Date parsing time: {:.2f} seconds ({:.1f}%)".format(date_parsing_time, _share(date_parsing_time)))
        print(" - Processed: {} rows, Empty: {} rows, Errors: {} rows".format(processed_count, empty_count, error_count))

    # Completion summary (end of looped event)
    MediLink_ConfigLoader.log("Surgery date conversion completed - Total: {:.2f}s, Cleaning: {:.2f}s, Parsing: {:.2f}s, Processed: {}, Empty: {}, Errors: {}".format(
        total_duration, date_cleaning_time, date_parsing_time, processed_count, empty_count, error_count), level="INFO")
619
+
620
def _create_common_tie_breakers(row):
    """
    Build the (last name, first name, patient id) tuple shared by every sort strategy.

    Names are stripped and upper-cased; missing or falsy values become ''.
    """
    def _name(field):
        return (row.get(field) or '').strip().upper()

    patient_id_text = str(row.get('Patient ID') or '')
    return (_name('Patient Last'), _name('Patient First'), patient_id_text)
629
+
630
def _normalize_surgery_date(row):
    """
    Resolve the datetime used for date-based sorting of a row.

    The row's '_earliest_surgery_date' string (MM-DD-YYYY) wins when it is
    present and parseable; otherwise the 'Surgery Date' field is used, either
    as a datetime or parsed from MM/DD/YYYY or MM-DD-YYYY. Anything else
    yields datetime.min.
    """
    earliest = row.get('_earliest_surgery_date')
    has_earliest = isinstance(earliest, str) and earliest and earliest != 'MISSING'
    if has_earliest:
        try:
            return datetime.strptime(earliest, '%m-%d-%Y')
        except Exception:
            pass

    # Fall back to the row's own Surgery Date field.
    raw = row.get('Surgery Date')
    if isinstance(raw, datetime):
        return raw
    if isinstance(raw, str) and raw.strip():
        for layout in ('%m/%d/%Y', '%m-%d-%Y'):
            try:
                return datetime.strptime(raw, layout)
            except ValueError:
                continue

    return datetime.min
656
+
657
def _get_schedule_position(row):
    """
    Look up the row's position in the stored schedule data.

    The lookup key is the row's 'Surgery Date' rendered as MM-DD-YYYY (or its
    str() form when it is not a datetime). Rows without a matching entry in
    '_schedule_positions' get 9999 so they sort to the end.
    """
    positions = row.get('_schedule_positions', {})
    raw_date = row.get('Surgery Date')
    lookup_key = raw_date.strftime('%m-%d-%Y') if isinstance(raw_date, datetime) else str(raw_date)
    return positions.get(lookup_key, 9999)
673
+
674
def _get_surgery_date_string(row):
    """
    Return the row's 'Surgery Date' as text: MM-DD-YYYY for datetimes, str() otherwise.
    """
    raw_date = row.get('Surgery Date')
    return raw_date.strftime('%m-%d-%Y') if isinstance(raw_date, datetime) else str(raw_date)
683
+
684
def _create_date_based_sort_key(row):
    """
    Date-based sort key: the normalized surgery date followed by the common tie-breakers.
    """
    return (_normalize_surgery_date(row),) + _create_common_tie_breakers(row)
691
+
692
def _create_schedule_based_sort_key(row):
    """
    Schedule-based sort key.

    Orders by the row's schedule position first, then by the surgery date
    string, then by the common tie-breakers.
    """
    leading = (_get_schedule_position(row), _get_surgery_date_string(row))
    return leading + _create_common_tie_breakers(row)
701
+
702
def create_sort_key_strategy(strategy_type='date_based'):
    """
    Return the sort-key function for the requested strategy.

    'schedule_based' selects the schedule-position key; every other value
    (including the default) selects the date-based key.
    """
    use_schedule = strategy_type == 'schedule_based'
    return _create_schedule_based_sort_key if use_schedule else _create_date_based_sort_key
711
+
712
def sort_and_deduplicate(csv_data):
    """
    Collapse csv_data to one row per 'Patient ID' and sort the result in place.

    For each patient the row with the latest surgery date is kept, and every
    surgery date seen for that patient is recorded on the surviving row:
    - '_all_surgery_dates': sorted list of date strings ('MISSING' for empty dates)
    - '_primary_surgery_date': the surviving row's own 'Surgery Date'
    - '_earliest_surgery_date': earliest MM-DD-YYYY date string, or a fallback
      derived from the row's own 'Surgery Date', or None

    Parameters:
    - csv_data (list): List of row dictionaries; replaced in place with the
      de-duplicated rows sorted by the configured sorting strategy.
    """
    def _as_datetime(value):
        # Coerce a date value (datetime or MM/DD/YYYY / MM-DD-YYYY text) to a
        # datetime; empty, 'MISSING' or unparseable values map to datetime.min.
        if isinstance(value, datetime):
            return value
        if isinstance(value, str) and value.strip():
            for layout in ('%m/%d/%Y', '%m-%d-%Y'):
                try:
                    return datetime.strptime(value, layout)
                except ValueError:
                    continue
        return datetime.min

    unique_patients = {}
    patient_surgery_dates = {}

    for row in csv_data:
        patient_id = row.get('Patient ID')
        surgery_date = row.get('Surgery Date')

        if patient_id not in unique_patients:
            unique_patients[patient_id] = row
            patient_surgery_dates[patient_id] = [surgery_date]
            continue

        known_dates = patient_surgery_dates[patient_id]
        existing_date = unique_patients[patient_id]['Surgery Date']

        # Later surgery date wins: that row carries the most current demographics.
        if _as_datetime(surgery_date) > _as_datetime(existing_date):
            if existing_date not in known_dates:
                known_dates.append(existing_date)
            unique_patients[patient_id] = row

        if surgery_date not in known_dates:
            known_dates.append(surgery_date)

    for patient_id, row in unique_patients.items():
        # Render every known date as text, de-duplicated.
        labels = set()
        for value in patient_surgery_dates[patient_id]:
            if isinstance(value, datetime):
                labels.add('MISSING' if value == datetime.min else value.strftime('%m-%d-%Y'))
            else:
                labels.add(str(value) if value else 'MISSING')

        # Tolerant sort key: a label in an unexpected format sorts first
        # instead of raising ValueError and aborting the whole run. The label
        # itself is the secondary key so the order is deterministic.
        sorted_surgery_dates = sorted(labels, key=lambda label: (_as_datetime(label), label))
        row['_all_surgery_dates'] = sorted_surgery_dates
        row['_primary_surgery_date'] = row['Surgery Date']  # Date of the row that supplied the demographics

        # Earliest date = first label that is a valid MM-DD-YYYY string.
        earliest_str = None
        for label in sorted_surgery_dates:
            if not label or label == 'MISSING':
                continue
            try:
                datetime.strptime(label, '%m-%d-%Y')
            except ValueError:
                continue
            earliest_str = label
            break

        # Fallback to the row's own date when no label qualified.
        if earliest_str is None:
            own_date = row.get('Surgery Date')
            if isinstance(own_date, datetime):
                if own_date != datetime.min:
                    earliest_str = own_date.strftime('%m-%d-%Y')
            elif isinstance(own_date, str) and own_date.strip():
                earliest_str = own_date
        row['_earliest_surgery_date'] = earliest_str

    # Sort with the configured strategy and write the result back in place.
    sort_key_func = create_sort_key_strategy(SORTING_STRATEGY)
    csv_data[:] = sorted(unique_patients.values(), key=sort_key_func)
826
+
827
+ # TODO: Consider adding an option in the config to sort based on Surgery Schedules when available.
828
+ # If no schedule is available, the current sorting strategy will be used.
829
+ #
830
+ # IMPLEMENTATION STATUS: Backend infrastructure is ready.
831
+ # To enable surgery schedule sorting, set SORTING_STRATEGY = 'schedule_based' above.
832
+ # The system will automatically fall back to date-based sorting if schedule data is unavailable.
833
+
834
def combine_fields(csv_data):
    """
    Derive display fields on every row, in place.

    - 'Surgery Date': datetime -> 'MM-DD-YYYY' (datetime.min or empty -> 'MISSING');
      non-empty non-datetime values are kept as str().
    - 'Patient Name': 'Last, First M' where multi-word names are joined with
      underscores and M is the first letter of the middle name, if any.
    - 'Patient Street': 'Patient Address1' and 'Patient Address2' joined by a space.

    Parameters:
    - csv_data (list): List of row dictionaries to update.
    """
    def _text(row, key):
        # A key that is present with a None value would bypass a .get() default,
        # so normalize None explicitly before stripping.
        value = row.get(key)
        return '' if value is None else str(value).strip()

    for row in csv_data:
        surgery_date = row.get('Surgery Date')
        try:
            if isinstance(surgery_date, datetime):
                if surgery_date == datetime.min:
                    row['Surgery Date'] = 'MISSING'
                else:
                    row['Surgery Date'] = surgery_date.strftime('%m-%d-%Y')
            elif surgery_date:
                # Already a non-empty string
                row['Surgery Date'] = str(surgery_date)
            else:
                row['Surgery Date'] = 'MISSING'
        except Exception:
            # Deliberate best-effort: any odd value is reported as missing.
            row['Surgery Date'] = 'MISSING'

        first_name = '_'.join(_text(row, 'Patient First').split())
        last_name = '_'.join(_text(row, 'Patient Last').split())
        # First character only. Slicing keeps a one-letter initial, which the
        # previous `len(...) > 1` test silently discarded.
        middle_initial = _text(row, 'Patient Middle')[:1]

        name = ', '.join(filter(None, [last_name, first_name]))
        row['Patient Name'] = name + (' ' + middle_initial if middle_initial else '')

        address_parts = [_text(row, 'Patient Address1'), _text(row, 'Patient Address2')]
        row['Patient Street'] = ' '.join(filter(None, address_parts))
861
+
862
def apply_replacements(csv_data, crosswalk):
    """
    Apply at most one crosswalk 'csv_replacements' substitution to each row, in place.

    Replacements are tried in mapping order and, for each, the fields
    'Patient SSN', 'Primary Insurance' and 'Ins1 Payer ID' are checked in that
    order. The first field whose value equals the old value is overwritten and
    processing of that row stops.
    """
    substitutions = crosswalk.get('csv_replacements', {})
    candidate_fields = ('Patient SSN', 'Primary Insurance', 'Ins1 Payer ID')

    for record in csv_data:
        first_hit = next(
            ((field, replacement)
             for original, replacement in substitutions.items()
             for field in candidate_fields
             if record.get(field) == original),
            None,
        )
        if first_hit is not None:
            field, replacement = first_hit
            record[field] = replacement
878
+
879
+ import difflib
880
+ from collections import defaultdict
881
+
882
def find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names):
    """
    Pick the Medisoft ID whose MAINS name is most similar to an insurance name.

    Digits are removed from both names and the remainder is upper-cased before
    a difflib.SequenceMatcher ratio is computed. No minimum threshold is
    applied: the candidate with the highest non-zero ratio wins.

    Parameters:
    - insurance_name (str): The insurance name from the CSV row.
    - medisoft_ids (list): Candidate Medisoft IDs associated with the Payer ID.
    - medisoft_to_mains_names (dict): Mapping from Medisoft ID to list of MAINS names.

    Returns:
    - The winning entry of medisoft_ids (returned as stored, not converted),
      or None if no candidate scored above zero.
    """
    def _letters_only(text):
        return ''.join(ch for ch in text if not ch.isdigit()).upper()

    target = _letters_only(insurance_name)
    top_ratio = 0
    top_id = None

    for candidate_id in medisoft_ids:
        for mains_name in medisoft_to_mains_names.get(candidate_id, []):
            candidate = _letters_only(mains_name)
            MediLink_ConfigLoader.log("Processing Medisoft ID '{}': Comparing processed insurance '{}' with processed mains '{}'.".format(candidate_id, target, candidate), level="DEBUG")

            ratio = difflib.SequenceMatcher(None, target, candidate).ratio()
            MediLink_ConfigLoader.log("Match ratio for Medisoft ID '{}': {:.2f}".format(candidate_id, ratio), level="DEBUG")

            if ratio <= top_ratio:
                continue
            top_ratio, top_id = ratio, candidate_id
            MediLink_ConfigLoader.log("New best match found: Medisoft ID '{}' with match ratio {:.2f}".format(top_id, top_ratio), level="DEBUG")

    MediLink_ConfigLoader.log("Final best match ratio: {:.2f} for Medisoft ID '{}'".format(top_ratio, top_id), level="DEBUG")
    return top_id
927
+
928
def NEW_update_insurance_ids(csv_data, config, crosswalk):
    """
    Updates the 'Ins1 Insurance ID' field in each row of csv_data based on the crosswalk and MAINS data.

    A payer with exactly one Medisoft ID gets that ID (as int). A payer with
    several IDs is resolved by fuzzy-matching the row's 'Primary Insurance'
    against MAINS names, defaulting to the first ID. A payer unknown to the
    crosswalk gets an empty placeholder entry added to crosswalk['payer_id'].
    Each row also receives a 1-based '_row_index' key.

    Parameters:
    - csv_data (list of dict): The CSV data where each row is represented as a dictionary.
    - config (dict): Configuration object containing necessary paths and parameters.
    - crosswalk (dict): Crosswalk data containing mappings between Payer IDs and Medisoft IDs.

    Returns:
    - None: The function modifies the csv_data in place.
    """
    processed_payer_ids = set()  # Track processed Payer IDs
    MediLink_ConfigLoader.log("Starting update of insurance IDs.", level="INFO")

    # Flattened payer lookup built once, so rows avoid nested dictionary access.
    payer_cache = {}
    for payer_id, details in crosswalk.get('payer_id', {}).items():
        payer_cache[payer_id] = {
            'medisoft_id': details.get('medisoft_id', []),
            'medisoft_medicare_id': details.get('medisoft_medicare_id', []),
            'endpoint': details.get('endpoint', None)
        }
    MediLink_ConfigLoader.log("Built payer cache for {} payers".format(len(payer_cache)), level="DEBUG")

    # MAINS data: insurance name -> Medisoft ID (presumably a dict; verify against the loader).
    insurance_to_id = load_insurance_data_from_mains(config)
    MediLink_ConfigLoader.log("Loaded MAINS data for insurance to ID mapping.", level="DEBUG")

    # Invert the mapping to get Medisoft ID -> MAINS names
    medisoft_to_mains_names = defaultdict(list)
    for insurance_name, medisoft_id in insurance_to_id.items():
        medisoft_to_mains_names[medisoft_id].append(insurance_name)

    for row_idx, row in enumerate(csv_data, 1):
        row['_row_index'] = row_idx
        ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
        MediLink_ConfigLoader.log("Processing row with Ins1 Payer ID: '{}'.".format(ins1_payer_id), level="DEBUG")

        if not ins1_payer_id:
            continue

        if ins1_payer_id not in processed_payer_ids:
            processed_payer_ids.add(ins1_payer_id)
            MediLink_ConfigLoader.log("Marked Payer ID '{}' as processed.".format(ins1_payer_id), level="DEBUG")

        payer_info = payer_cache.get(ins1_payer_id, {})
        medisoft_ids = payer_info.get('medisoft_id', [])
        MediLink_ConfigLoader.log("Retrieved Medisoft IDs for Payer ID '{}': {}".format(ins1_payer_id, medisoft_ids), level="DEBUG")

        if not medisoft_ids:
            MediLink_ConfigLoader.log("No Medisoft IDs available for Payer ID '{}', creating placeholder entry.".format(ins1_payer_id), level="WARNING")
            placeholder_entry = {
                'medisoft_id': [],  # Placeholder for future Medisoft IDs
                'medisoft_medicare_id': [],  # Placeholder for future Medicare IDs
                'endpoint': None  # Placeholder for future endpoint
            }
            if 'payer_id' not in crosswalk:
                crosswalk['payer_id'] = {}
            crosswalk['payer_id'][ins1_payer_id] = placeholder_entry
            payer_cache[ins1_payer_id] = placeholder_entry
            continue  # Nothing to assign for this row

        # Exactly one Medisoft ID: assign it directly.
        if len(medisoft_ids) == 1:
            try:
                medisoft_id = int(medisoft_ids[0])
                row['Ins1 Insurance ID'] = medisoft_id
                # Log the enumerate index directly. The previous
                # getattr(row, '_row_index', ...) looked for an attribute on a
                # dict and therefore always logged 'Unknown'.
                MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row number {} with Payer ID '{}'.".format(medisoft_id, row_idx, ins1_payer_id), level="DEBUG")
            except ValueError as e:
                MediLink_ConfigLoader.log("Error converting Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
                row['Ins1 Insurance ID'] = None
            continue  # Move to the next row

        # Several Medisoft IDs: disambiguate by fuzzy-matching the insurance name.
        insurance_name = row.get('Primary Insurance', '').strip()
        if not insurance_name:
            MediLink_ConfigLoader.log("Row with Payer ID '{}' missing 'Primary Insurance', skipping assignment.".format(ins1_payer_id), level="WARNING")
            continue

        best_medisoft_id = find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names)

        if best_medisoft_id:
            row['Ins1 Insurance ID'] = best_medisoft_id
            MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row with Payer ID '{}' based on fuzzy match.".format(best_medisoft_id, ins1_payer_id), level="INFO")
        else:
            # Default to the first Medisoft ID if no good match is found
            try:
                default_medisoft_id = int(medisoft_ids[0])
                row['Ins1 Insurance ID'] = default_medisoft_id
                MediLink_ConfigLoader.log("No suitable match found. Defaulted to Medisoft ID '{}' for Payer ID '{}'.".format(default_medisoft_id, ins1_payer_id), level="INFO")
            except ValueError as e:
                MediLink_ConfigLoader.log("Error converting default Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
                row['Ins1 Insurance ID'] = None
1028
+
1029
+ def update_insurance_ids(csv_data, config, crosswalk):
1030
+ # LOGGING STRATEGY: Remove DEBUG level function start log - DEBUG is typically silent anyway
1031
+ # MediLink_ConfigLoader.log("Starting update_insurance_ids function.", level="DEBUG")
1032
+
1033
+ # TIMING: Start insurance ID updates with granular tracking
1034
+ total_start_time = time.time()
1035
+ lookup_build_time = 0
1036
+ csv_processing_time = 0
1037
+ processed_count = 0
1038
+ medicare_count = 0
1039
+ regular_count = 0
1040
+ placeholder_count = 0
1041
+
1042
+ print("Starting insurance ID updates for {} rows...".format(len(csv_data)))
1043
+ # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes
1044
+ # MediLink_ConfigLoader.log("Starting insurance ID updates for {} rows...".format(len(csv_data)), level="INFO") # REMOVED
1045
+
1046
+ # TIMING: Start lookup dictionary building
1047
+ lookup_start_time = time.time()
1048
+
1049
+ # PERFORMANCE FIX: Pre-build optimized lookup dictionaries for both regular and Medicare IDs
1050
+ # This reduces Medicare processing overhead by building lookups once instead of repeated processing
1051
+ payer_id_to_medisoft = {}
1052
+ payer_id_to_medicare = {}
1053
+ # LOGGING STRATEGY: Remove DEBUG level initialization log - DEBUG is typically silent anyway
1054
+ # MediLink_ConfigLoader.log("Initialized optimized lookup dictionaries for Medicare and regular IDs.", level="DEBUG")
1055
+
1056
+ # Build both lookup dictionaries simultaneously to avoid multiple iterations
1057
+ for payer_id, details in crosswalk.get('payer_id', {}).items():
1058
+ # Get both regular and Medicare IDs
1059
+ medisoft_ids = details.get('medisoft_id', [])
1060
+ medicare_ids = details.get('medisoft_medicare_id', [])
1061
+
1062
+ # Filter empty strings once for each type
1063
+ medisoft_ids = [id for id in medisoft_ids if id] if medisoft_ids else []
1064
+ medicare_ids = [id for id in medicare_ids if id] if medicare_ids else []
1065
+
1066
+ # Store first valid ID for quick lookup (Medicare takes precedence if available)
1067
+ payer_id_to_medisoft[payer_id] = int(medisoft_ids[0]) if medisoft_ids else None
1068
+ payer_id_to_medicare[payer_id] = int(medicare_ids[0]) if medicare_ids else None
1069
+
1070
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1071
+ # if len(payer_id_to_medisoft) <= 10 or len(payer_id_to_medisoft) % 50 == 0: # Log first 10 and every 50th
1072
+ # MediLink_ConfigLoader.log("Processed Payer ID '{}': Regular IDs: {}, Medicare IDs: {}".format(
1073
+ # payer_id, medisoft_ids, medicare_ids), level="DEBUG")
1074
+
1075
+ # TIMING: End lookup dictionary building
1076
+ lookup_end_time = time.time()
1077
+ lookup_build_time = lookup_end_time - lookup_start_time
1078
+
1079
+ if PERFORMANCE_LOGGING:
1080
+ print("Built lookup dictionaries in {:.2f} seconds for {} payer IDs".format(lookup_build_time, len(payer_id_to_medisoft)))
1081
+
1082
+
1083
+ # TIMING: Start CSV processing
1084
+ csv_start_time = time.time()
1085
+
1086
+ # PERFORMANCE FIX: Single pass through CSV data with optimized Medicare ID resolution
1087
+ for row_idx, row in enumerate(csv_data, 1):
1088
+ ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
1089
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1090
+ # if row_idx <= 10 or row_idx % 100 == 0: # Log first 10 and every 100th
1091
+ # MediLink_ConfigLoader.log("Processing row #{} with Ins1 Payer ID '{}'.".format(row_idx, ins1_payer_id), level="DEBUG")
1092
+
1093
+ # Try Medicare ID first, then fall back to regular ID (optimized Medicare processing)
1094
+ insurance_id = (payer_id_to_medicare.get(ins1_payer_id) or
1095
+ payer_id_to_medisoft.get(ins1_payer_id))
1096
+
1097
+ if insurance_id is None and ins1_payer_id not in payer_id_to_medisoft:
1098
+ # Add placeholder entry for new payer ID (preserve original functionality)
1099
+ payer_id_to_medisoft[ins1_payer_id] = None
1100
+ payer_id_to_medicare[ins1_payer_id] = None
1101
+ crosswalk.setdefault('payer_id', {})[ins1_payer_id] = {
1102
+ 'medisoft_id': [], # Placeholder for future Medisoft IDs
1103
+ 'medisoft_medicare_id': [], # Placeholder for future Medicare IDs
1104
+ 'endpoint': None # Placeholder for future endpoint
1105
+ }
1106
+ placeholder_count += 1
1107
+ # LOGGING STRATEGY: Log actual events (new payer IDs) at INFO level
1108
+ if placeholder_count <= 5: # Only log first 5 placeholders
1109
+ MediLink_ConfigLoader.log("Added placeholder entry for new Payer ID '{}'.".format(ins1_payer_id), level="INFO")
1110
+ elif insurance_id == payer_id_to_medicare.get(ins1_payer_id):
1111
+ medicare_count += 1
1112
+ else:
1113
+ regular_count += 1
1114
+
1115
+ # Assign the resolved insurance ID to the row
1116
+ row['Ins1 Insurance ID'] = insurance_id
1117
+ # TODO (SECONDARY QUEUE): When building a secondary-claims queue after Medicare crossover,
1118
+ # set claim_type='secondary' and attach prior payer fields here from the Medicare primary outcome:
1119
+ # - row['prior_payer_name'] = 'MEDICARE'
1120
+ # - row['prior_payer_id'] = best Medicare ID from config/crosswalk
1121
+ # - optionally row['primary_paid_amount'], row['cas_adjustments'] extracted from 835
1122
+ processed_count += 1
1123
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1124
+ # if processed_count <= 10 or processed_count % 100 == 0: # Log first 10 and every 100th
1125
+ # MediLink_ConfigLoader.log("Assigned Insurance ID '{}' to row with Ins1 Payer ID '{}'.".format(insurance_id, ins1_payer_id), level="DEBUG")
1126
+
1127
+ # TIMING: End CSV processing
1128
+ csv_end_time = time.time()
1129
+ csv_processing_time = csv_end_time - csv_start_time
1130
+
1131
+ # TIMING: End total insurance ID updates
1132
+ total_end_time = time.time()
1133
+ total_duration = total_end_time - total_start_time
1134
+
1135
+ if PERFORMANCE_LOGGING:
1136
+ print("Insurance ID updates completed:")
1137
+ print(" - Total duration: {:.2f} seconds".format(total_duration))
1138
+ print(" - Lookup building time: {:.2f} seconds ({:.1f}%)".format(lookup_build_time, (lookup_build_time/total_duration)*100))
1139
+ print(" - CSV processing time: {:.2f} seconds ({:.1f}%)".format(csv_processing_time, (csv_processing_time/total_duration)*100))
1140
+ print(" - Processed: {} rows, Medicare: {} rows, Regular: {} rows, Placeholders: {} rows".format(
1141
+ processed_count, medicare_count, regular_count, placeholder_count))
1142
+
1143
+ # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
1144
+ MediLink_ConfigLoader.log("Insurance ID updates completed - Total: {:.2f}s, Lookup: {:.2f}s, Processing: {:.2f}s, Processed: {}, Medicare: {}, Regular: {}, Placeholders: {}".format(
1145
+ total_duration, lookup_build_time, csv_processing_time, processed_count, medicare_count, regular_count, placeholder_count), level="INFO")
1146
+
1147
def update_procedure_codes(csv_data, crosswalk):
    """
    Fill in the 'Procedure Code' column of each CSV row from its Medisoft diagnosis shorthand.

    The shorthand in 'Default Diagnosis #1' is translated back to a diagnosis code through the
    reversed crosswalk['diagnosis_to_medisoft'] table, and that diagnosis code is then looked up
    in the reversed crosswalk['procedure_to_diagnosis'] table. Rows that cannot be resolved get
    the literal "Unknown" and a WARNING log entry.

    Args:
        csv_data (list): Row dictionaries; mutated in place.
        crosswalk (dict): Crosswalk holding 'diagnosis_to_medisoft' and 'procedure_to_diagnosis'.

    Returns:
        bool: Always True.
    """
    # Medisoft shorthand -> diagnosis code (reverse of the crosswalk table).
    # BUG We need to be careful here in case we decide we need to change the crosswalk data
    # specifically with regard to the T8/H usage.
    shorthand_to_diagnosis = {}
    for diagnosis, shorthand in crosswalk.get('diagnosis_to_medisoft', {}).items():
        shorthand_to_diagnosis[shorthand] = diagnosis

    # Diagnosis code -> procedure code (reverse of procedure -> [diagnosis, ...]).
    procedure_for_diagnosis = {}
    for procedure, diagnoses in crosswalk.get('procedure_to_diagnosis', {}).items():
        for diagnosis in diagnoses:
            procedure_for_diagnosis[diagnosis] = procedure

    rows_updated = 0
    unmapped_shorthands = set()
    unmapped_diagnoses = set()

    for row_num, row in enumerate(csv_data, start=1):
        try:
            shorthand = row.get('Default Diagnosis #1', '').strip()
            diagnosis = shorthand_to_diagnosis.get(shorthand)

            if not diagnosis:
                # Only remember the shorthand for the summary when there actually is one.
                # NOTE(review): rows with an empty shorthand are still marked "Unknown" and
                # logged here -- confirm this matches the intended nesting.
                if shorthand:
                    unmapped_shorthands.add(shorthand)
                row['Procedure Code'] = "Unknown"  # Will be handled by 837p encoder
                MediLink_ConfigLoader.log("Missing Medisoft code mapping for '{}' in row {}".format(
                    shorthand, row_num), level="WARNING")
                continue

            procedure = procedure_for_diagnosis.get(diagnosis)
            if procedure:
                row['Procedure Code'] = procedure
                rows_updated += 1
                continue

            # Diagnosis is known but has no procedure attached to it.
            unmapped_diagnoses.add(diagnosis)
            row['Procedure Code'] = "Unknown"  # Will be handled by 837p encoder
            MediLink_ConfigLoader.log("Missing procedure mapping for diagnosis code '{}' (Medisoft code: '{}') in row {}".format(
                diagnosis, shorthand, row_num), level="WARNING")
        except Exception as e:
            # A bad row must not stop the remaining rows from being processed.
            MediLink_ConfigLoader.log("In update_procedure_codes, Error processing row {}: {}".format(row_num, e), level="ERROR")

    # Summary statistics
    MediLink_ConfigLoader.log("Total {} 'Procedure Code' rows updated.".format(rows_updated), level="INFO")

    if unmapped_shorthands:
        shorthand_list = sorted(unmapped_shorthands)
        MediLink_ConfigLoader.log("Missing Medisoft code mappings: {}".format(shorthand_list), level="WARNING")
        print("WARNING: {} Medisoft codes need to be added to diagnosis_to_medisoft mapping: {}".format(
            len(unmapped_shorthands), shorthand_list))

    if unmapped_diagnoses:
        diagnosis_list = sorted(unmapped_diagnoses)
        MediLink_ConfigLoader.log("Missing procedure mappings for diagnosis codes: {}".format(diagnosis_list), level="WARNING")
        print("WARNING: {} diagnosis codes need to be added to procedure_to_diagnosis mapping: {}".format(
            len(unmapped_diagnoses), diagnosis_list))

    return True
1206
+
1207
def _diag_share_pct(part, total):
    """Return `part` as a percentage of `total`, or 0.0 when `total` is zero."""
    # time.time() can return identical values for close calls on coarse clocks,
    # so a zero total must not crash the timing report.
    return (part / total) * 100 if total else 0.0


def _diag_fallback_shorthand(diagnosis_code):
    """
    Derive a Medisoft shorthand for a diagnosis code that has no crosswalk mapping.

    Leading 'H' characters and a literal leading 'T8' prefix are removed, dots are dropped and
    the last five characters are kept. Returns 'N/A' when the result is shorter than three
    characters or when the code cannot be processed as a string.
    """
    try:
        trimmed = diagnosis_code.lstrip('H')
        # str.lstrip('T8') would strip ANY leading 'T' or '8' characters (it takes a character
        # set, not a prefix), so the prefix is removed explicitly instead.
        if trimmed.startswith('T8'):
            trimmed = trimmed[2:]
        defaulted_code = trimmed.replace('.', '')[-5:]
        # Basic validation: ensure code is not empty and has reasonable length
        if defaulted_code and len(defaulted_code) >= 3:
            MediLink_ConfigLoader.log("Missing diagnosis mapping for '{}', using fallback code '{}'".format(
                diagnosis_code, defaulted_code), level="WARNING")
            return defaulted_code
        MediLink_ConfigLoader.log("Fallback diagnosis code validation failed for '{}', using 'N/A'".format(
            diagnosis_code), level="WARNING")
    except Exception as e:
        MediLink_ConfigLoader.log("Error in fallback diagnosis code generation for '{}': {}".format(
            diagnosis_code, str(e)), level="WARNING")
    return 'N/A'


def update_diagnosis_codes(csv_data):
    """
    Populate diagnosis information on CSV rows from DOCX surgery schedules.

    DOCX files in the configured local storage path whose modification time falls within
    +/- 8 days of the CSV's surgery date range are parsed with parse_docx(). For every row whose
    Patient ID appears in the parsed data the function sets:
      - row['_surgery_date_to_diagnosis']: surgery date string ('%m-%d-%Y') -> Medisoft shorthand
      - row['_schedule_positions']: only when SORTING_STRATEGY == 'schedule_based'
      - row['Default Diagnosis #1']: shorthand for the row's primary 'Surgery Date' ('N/A' if none)

    Args:
        csv_data (list): Row dictionaries; mutated in place.

    Returns:
        None. Any exception is caught, logged at ERROR level and printed.
    """
    try:
        # TIMING: Start surgery schedule parsing timing
        parsing_start_time = time.time()
        print("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")))
        MediLink_ConfigLoader.log("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")), level="INFO")

        # Use cached configuration instead of loading repeatedly
        config, crosswalk = get_cached_configuration()

        # Extract the local storage path from the configuration
        local_storage_path = config['MediLink_Config']['local_storage_path']

        # Diagnosis data and schedule positions merged across all DOCX files
        all_patient_data = {}
        all_schedule_positions = {}  # Schedule positions for future sorting

        # Convert surgery dates in CSV data
        convert_surgery_date(csv_data)

        # Extract all valid surgery dates from csv_data (datetime.min marks an invalid date)
        surgery_dates = [row['Surgery Date'] for row in csv_data if row['Surgery Date'] != datetime.min]

        if not surgery_dates:
            raise ValueError("No valid surgery dates found in csv_data.")

        # Determine the minimum and maximum surgery dates
        min_surgery_date = min(surgery_dates)
        max_surgery_date = max(surgery_dates)

        # Apply a +/-8-day margin to the surgery dates... Increased from 5 days.
        margin = timedelta(days=8)
        threshold_start = min_surgery_date - margin
        threshold_end = max_surgery_date + margin

        # TODO (Low) This is a bad idea. The file modification date is a poor proxy for the
        # surgery date contained inside the DOCX file: if the DOCX files are downloaded 'too late'
        # they are presumed out of range. Extracting the surgery date from each DOCX file is
        # computationally expensive, so a smarter approach is needed.
        MediLink_ConfigLoader.log("BAD IDEA: Processing DOCX files modified between {} and {}.".format(threshold_start, threshold_end), level="INFO")

        # TIMING: Start file system operations
        filesystem_start_time = time.time()

        # Pre-convert threshold timestamps so each file needs only a float comparison
        # (falls back to time.mktime when datetime.timestamp() is unavailable).
        threshold_start_ts = threshold_start.timestamp() if hasattr(threshold_start, 'timestamp') else time.mktime(threshold_start.timetuple())
        threshold_end_ts = threshold_end.timestamp() if hasattr(threshold_end, 'timestamp') else time.mktime(threshold_end.timetuple())

        valid_files = []
        try:
            for filename in os.listdir(local_storage_path):
                if filename.endswith('.docx'):
                    filepath = os.path.join(local_storage_path, filename)
                    try:
                        stat_info = os.stat(filepath)
                        if threshold_start_ts <= stat_info.st_mtime <= threshold_end_ts:
                            valid_files.append(filepath)
                    except (OSError, ValueError):
                        # Skip files with invalid modification times
                        continue
        except OSError:
            MediLink_ConfigLoader.log("Error accessing directory: {}".format(local_storage_path), level="ERROR")
            return

        # TIMING: End file system operations
        filesystem_end_time = time.time()
        filesystem_duration = filesystem_end_time - filesystem_start_time

        MediLink_ConfigLoader.log("Found {} DOCX files within date threshold".format(len(valid_files)), level="INFO")

        # TIMING: Start CSV data preprocessing
        csv_prep_start_time = time.time()

        # Set of patient IDs present in the CSV, used to check for any overlap with DOCX data
        patient_ids_in_csv = {row.get('Patient ID', '').strip() for row in csv_data}

        # TIMING: End CSV data preprocessing
        csv_prep_end_time = time.time()
        csv_prep_duration = csv_prep_end_time - csv_prep_start_time

        # TIMING: Log before processing DOCX files
        docx_processing_start_time = time.time()
        print("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)))
        MediLink_ConfigLoader.log("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)), level="INFO")

        # TIMING: Track individual DOCX file processing
        docx_files_processed = 0
        docx_files_skipped = 0  # Reported in the summary; nothing in this function increments it
        docx_parse_errors = 0

        # Process valid DOCX files
        for filepath in valid_files:
            # TIMING: Start individual file processing
            file_start_time = time.time()

            try:
                if SORTING_STRATEGY == 'schedule_based':
                    # Enhanced parsing to capture schedule positions
                    patient_data, schedule_positions = parse_docx(filepath, surgery_dates, capture_schedule_positions=True)
                    # Store schedule positions for future sorting
                    for patient_id, dates in schedule_positions.items():
                        if patient_id not in all_schedule_positions:
                            all_schedule_positions[patient_id] = {}
                        all_schedule_positions[patient_id].update(dates)
                else:
                    # Standard parsing (maintains backward compatibility)
                    patient_data = parse_docx(filepath, surgery_dates, capture_schedule_positions=False)

                docx_files_processed += 1

                for patient_id, service_dates in patient_data.items():
                    if patient_id not in all_patient_data:
                        all_patient_data[patient_id] = {}
                    for date_of_service, diagnosis_data in service_dates.items():
                        # TODO: SURGERY SCHEDULE CONFLICT RESOLUTION
                        # Implement enhanced conflict detection and logging as outlined in
                        # surgery_schedule_conflict_resolution_strategy.md
                        #
                        # Current behavior: Silent overwriting with latest file wins
                        # Proposed enhancement:
                        # 1. Detect when multiple files contain data for same date
                        # 2. Log conflicts with date-organized notifications showing:
                        #    - Source files (with modification timestamps)
                        #    - Patients affected (added/removed/modified)
                        #    - Specific changes (diagnosis, laterality, etc.)
                        # 3. Use file modification time to determine priority
                        # 4. Generate summary report organized by surgery date
                        #
                        # Example notification format:
                        # "SURGERY SCHEDULE CONFLICTS DETECTED FOR: 12/15/2023"
                        # "  Original: file1.docx (modified: 08:30:00)"
                        # "  Revised: file2.docx (modified: 14:45:00)"
                        # "  Patients affected: 3 modified, 1 added, 1 removed"
                        # "  Resolution: Using latest file (file2.docx)"
                        #
                        # This will provide transparency when revised schedules overwrite
                        # original schedules, organized by the affected surgery dates.
                        all_patient_data[patient_id][date_of_service] = diagnosis_data
            except Exception as e:
                docx_parse_errors += 1
                MediLink_ConfigLoader.log("Error parsing DOCX file {}: {}".format(filepath, e), level="ERROR")

            # TIMING: End individual file processing
            file_end_time = time.time()
            file_duration = file_end_time - file_start_time

            # Log slow files (taking more than 1 second)
            if file_duration > 1.0 and PERFORMANCE_LOGGING:
                print(" - Slow file: {} (Duration: {:.2f} seconds)".format(os.path.basename(filepath), file_duration))

        # TIMING: Log DOCX processing completion
        docx_processing_end_time = time.time()
        docx_processing_duration = docx_processing_end_time - docx_processing_start_time
        if PERFORMANCE_LOGGING:
            print("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
                time.strftime("%H:%M:%S"), docx_processing_duration))
            print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(
                docx_files_processed, docx_files_skipped, docx_parse_errors))
        MediLink_ConfigLoader.log("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
            time.strftime("%H:%M:%S"), docx_processing_duration), level="INFO")

        # Log if no valid files were found
        if not valid_files:
            MediLink_ConfigLoader.log("No valid DOCX files found within the modification time threshold.", level="INFO")

        # Debug logging for all_patient_data
        MediLink_ConfigLoader.log("All patient data collected from DOCX files: {}".format(all_patient_data), level="DEBUG")

        # Nothing to do when no DOCX data was collected or none of it matches a CSV patient
        if not all_patient_data or not patient_ids_in_csv.intersection(all_patient_data.keys()):
            MediLink_ConfigLoader.log("No patient data collected or no matching Patient IDs found. Skipping further processing.", level="INFO")
            return

        # TIMING: Start CSV data matching
        csv_matching_start_time = time.time()

        # Get Medisoft shorthand dictionary from crosswalk.
        diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {})

        # Initialize counter for updated rows
        updated_count = 0

        # Single pass through the CSV rows: record a diagnosis shorthand for every surgery date
        # and set "Default Diagnosis #1" from the primary surgery date.
        for row_num, row in enumerate(csv_data, start=1):
            patient_id = row.get('Patient ID', '').strip()

            MediLink_ConfigLoader.log("Processing row number {}.".format(row_num), level="DEBUG")

            # All surgery dates for this patient; defaults to just the primary date
            all_surgery_dates = row.get('_all_surgery_dates', [row.get('Surgery Date')])

            if patient_id not in all_patient_data:
                MediLink_ConfigLoader.log("Patient ID: {} not found in DOCX data for row {}.".format(patient_id, row_num), level="INFO")
                continue

            patient_schedule = all_patient_data[patient_id]
            surgery_date_to_diagnosis = {}

            for surgery_date in all_surgery_dates:
                # Convert surgery date to string format for lookup
                try:
                    if hasattr(surgery_date, 'strftime'):
                        surgery_date_str = surgery_date.strftime('%m-%d-%Y')
                    else:
                        surgery_date_str = str(surgery_date)
                except Exception:
                    surgery_date_str = str(surgery_date)

                MediLink_ConfigLoader.log("Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")

                if surgery_date_str not in patient_schedule:
                    MediLink_ConfigLoader.log("No matching surgery date found for Patient ID: {} on date {}.".format(patient_id, surgery_date_str), level="INFO")
                    surgery_date_to_diagnosis[surgery_date_str] = 'N/A'
                    continue

                diagnosis_data = patient_schedule[surgery_date_str]
                # Expected shape is a 3-item sequence whose first item is the diagnosis code;
                # anything else is treated as the bare diagnosis code itself.
                try:
                    if isinstance(diagnosis_data, (list, tuple)) and len(diagnosis_data) >= 3:
                        diagnosis_code, _eye, _femto = diagnosis_data
                    else:
                        diagnosis_code = diagnosis_data if diagnosis_data else None
                except Exception as e:
                    MediLink_ConfigLoader.log("Error unpacking diagnosis data for Patient ID: {}, Surgery Date: {}: {}".format(
                        patient_id, surgery_date_str, str(e)), level="WARNING")
                    diagnosis_code = None

                MediLink_ConfigLoader.log("Found diagnosis data for Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")

                # Convert diagnosis code to Medisoft shorthand format.
                if diagnosis_code is None:
                    medisoft_shorthand = 'N/A'
                    MediLink_ConfigLoader.log("Diagnosis code is None for Patient ID: {}, Surgery Date: {}".format(
                        patient_id, surgery_date_str), level="WARNING")
                else:
                    medisoft_shorthand = diagnosis_to_medisoft.get(diagnosis_code, None)
                    if medisoft_shorthand is None:
                        if diagnosis_code:
                            medisoft_shorthand = _diag_fallback_shorthand(diagnosis_code)
                        else:
                            # An empty code has nothing to fall back on; store the 'N/A' marker
                            # rather than None so later string handling of the column works.
                            medisoft_shorthand = 'N/A'

                MediLink_ConfigLoader.log("Converted diagnosis code to Medisoft shorthand: {}".format(medisoft_shorthand), level="DEBUG")

                surgery_date_to_diagnosis[surgery_date_str] = medisoft_shorthand

            # Store the diagnosis mapping for all surgery dates
            row['_surgery_date_to_diagnosis'] = surgery_date_to_diagnosis

            # Store schedule positions for future sorting if available
            if SORTING_STRATEGY == 'schedule_based' and patient_id in all_schedule_positions:
                row['_schedule_positions'] = all_schedule_positions[patient_id]

            # Set the primary diagnosis code (for the main surgery date)
            primary_surgery_date = row.get('Surgery Date')
            if isinstance(primary_surgery_date, datetime):
                primary_surgery_date_str = primary_surgery_date.strftime('%m-%d-%Y')
            else:
                primary_surgery_date_str = str(primary_surgery_date)
            primary_diagnosis = surgery_date_to_diagnosis.get(primary_surgery_date_str, 'N/A')
            row['Default Diagnosis #1'] = primary_diagnosis

            updated_count += 1
            MediLink_ConfigLoader.log("Updated row number {} with diagnosis codes for {} surgery dates.".format(row_num, len(all_surgery_dates)), level="INFO")

        # TIMING: End CSV data matching
        csv_matching_end_time = time.time()
        csv_matching_duration = csv_matching_end_time - csv_matching_start_time

        # Log total count of updated rows
        MediLink_ConfigLoader.log("Total {} 'Default Diagnosis #1' rows updated.".format(updated_count), level="INFO")

        # TIMING: End surgery schedule parsing timing
        parsing_end_time = time.time()
        parsing_duration = parsing_end_time - parsing_start_time
        if PERFORMANCE_LOGGING:
            print("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
                time.strftime("%H:%M:%S"), parsing_duration))
            print(" - File system operations: {:.2f} seconds ({:.1f}%)".format(filesystem_duration, _diag_share_pct(filesystem_duration, parsing_duration)))
            print(" - CSV data preprocessing: {:.2f} seconds ({:.1f}%)".format(csv_prep_duration, _diag_share_pct(csv_prep_duration, parsing_duration)))
            print(" - DOCX file processing: {:.2f} seconds ({:.1f}%)".format(docx_processing_duration, _diag_share_pct(docx_processing_duration, parsing_duration)))
            print(" - CSV data matching: {:.2f} seconds ({:.1f}%)".format(csv_matching_duration, _diag_share_pct(csv_matching_duration, parsing_duration)))
            print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(docx_files_processed, docx_files_skipped, docx_parse_errors))
        MediLink_ConfigLoader.log("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
            time.strftime("%H:%M:%S"), parsing_duration), level="INFO")

    except Exception as e:
        message = "An error occurred while updating diagnosis codes. Please check the DOCX files and configuration: {}".format(e)
        MediLink_ConfigLoader.log(message, level="ERROR")
        print(message)
1538
+
1539
def load_data_sources(config, crosswalk):
    """
    Load the two historical mappings: MAPAT (patient -> insurance) and Carol's CSVs (payer -> patients).

    Args:
        config (dict): Configuration passed through to both loaders.
        crosswalk (dict): Crosswalk passed through to the MAPAT loader.

    Returns:
        tuple: (patient_id_to_insurance_id, payer_id_to_patient_ids)

    Raises:
        ValueError: If either loader returns an empty (falsy) result.
    """
    mapat_mapping = load_insurance_data_from_mapat(config, crosswalk)
    if not mapat_mapping:
        raise ValueError("Failed to load historical Patient ID to Insurance ID mappings from MAPAT.")

    # Carol's CSVs are only read once the MAPAT data is known to be usable.
    carol_mapping = load_historical_payer_to_patient_mappings(config)
    if not carol_mapping:
        raise ValueError("Failed to load historical Carol's CSVs.")

    return mapat_mapping, carol_mapping
1550
+
1551
def map_payer_ids_to_insurance_ids(patient_id_to_insurance_id, payer_id_to_patient_ids):
    """
    Build crosswalk detail entries for each Payer ID from the historical mappings.

    Args:
        patient_id_to_insurance_id (dict): Patient ID -> Medisoft insurance ID.
        payer_id_to_patient_ids (dict): Payer ID -> iterable of Patient IDs.

    Returns:
        dict: Payer ID -> {"endpoint", "medisoft_id", "medisoft_medicare_id"}. Payer IDs for
        which no patient resolves to an insurance ID are left out.
    """
    details_by_payer = {}

    for payer_id, patient_ids in payer_id_to_patient_ids.items():
        resolved_ids = set()

        for patient_id in patient_ids:
            if patient_id not in patient_id_to_insurance_id:
                MediLink_ConfigLoader.log("No matching Insurance ID found for Patient ID {}".format(patient_id))
                continue
            medisoft_id = patient_id_to_insurance_id[patient_id]
            resolved_ids.add(medisoft_id)
            MediLink_ConfigLoader.log("Added Medisoft ID {} for Patient ID {} and Payer ID {}".format(medisoft_id, patient_id, payer_id))

        if not resolved_ids:
            continue

        details_by_payer[payer_id] = {
            "endpoint": "OPTUMEDI",  # TODO Default, to be refined via API poll. There are 2 of these defaults!
            "medisoft_id": list(resolved_ids),
            "medisoft_medicare_id": []  # Placeholder for future implementation
        }

    return details_by_payer
1570
+
1571
def _display_mains_file_error(mains_path):
    """
    Log (when a logger is available) and print the critical "MAINS file missing" banner.

    Blocks for three seconds after printing so the operator has time to read the message.

    Args:
        mains_path (str): The path where the MAINS file was expected to be found.
    """
    error_msg = "CRITICAL: MAINS file not found at: {}. This file is required for insurance name to Medisoft ID mapping.".format(mains_path)
    if hasattr(MediLink_ConfigLoader, 'log'):
        MediLink_ConfigLoader.log(error_msg, level="CRITICAL")

    rule = "=" * 80
    banner_lines = (
        "\n" + rule,
        "CRITICAL ERROR: MAINS FILE MISSING",
        rule,
        "\nThe MAINS file is required for the following critical functions:",
        "* Mapping insurance company names to Medisoft IDs",
        "* Converting insurance names to payer IDs for claim submission",
        "* Creating properly formatted 837p claim files",
        "\nWithout this file, claim submission will fail because:",
        "* Insurance names cannot be converted to payer IDs",
        "* 837p claim files cannot be generated",
        "* Claims cannot be submitted to insurance companies",
        "\nTO FIX THIS:",
        "1. Ensure the MAINS file exists at: {}".format(mains_path),
        "2. If the file is missing, llamar a Dani",
        "3. The file should contain insurance company data from your Medisoft system",
        rule,
    )
    for banner_line in banner_lines:
        print(banner_line)

    time.sleep(3)  # 3 second pause to allow user to read critical error message
1598
+
1599
+
1600
def load_insurance_data_from_mains(config):
    """
    Build a mapping from insurance name to insurance ID by reading the MAINS fixed-width file.

    The insurance ID recorded for each name is the line number reported by the MAINS reader.
    Every failure mode is handled internally; the function returns whatever was collected so far
    (possibly an empty dict) instead of raising.

    Args:
        config (dict): Accepted for interface compatibility. The cached configuration returned by
            get_cached_configuration() is what is actually used.

    Returns:
        dict: A dictionary mapping insurance names to insurance IDs.
    """
    # Use cached configuration to avoid repeated loading
    try:
        cached_config, cached_crosswalk = get_cached_configuration()
    except Exception as e:
        print("Warning: Failed to load cached configuration: {}".format(e))
        return {}

    # XP Compatibility: the data-management module may have failed to import
    if MediLink_DataMgmt is None:
        print("Warning: MediLink_DataMgmt not available. Cannot load MAINS data.")
        return {}

    # TODO (Low) For secondary insurance, this needs to be pulling from the correct MAINS (there are 2)
    # TODO (Low) Performance: There probably needs to be a dictionary proxy for MAINS that gets updated.
    #   Meh, this just has to be part of the new architecture plan where we make Medisoft a downstream
    #   recipient from the db.
    # TODO (High) The Medisoft Medicare flag needs to be brought in here.
    try:
        mains_path = cached_config.get('MAINS_MED_PATH', '')
        mains_slices = cached_crosswalk.get('mains_mapping', {}).get('slices', {})
    except (KeyError, AttributeError) as e:
        print("Warning: Failed to get MAINS configuration: {}".format(e))
        return {}

    name_to_id = {}

    try:
        # Bail out with the operator-facing banner when the file is not there
        if not os.path.exists(mains_path):
            _display_mains_file_error(mains_path)
            return name_to_id

        # XP Compatibility: the reader function itself may be missing from the module
        if not hasattr(MediLink_DataMgmt, 'read_general_fixed_width_data'):
            print("Warning: MediLink_DataMgmt.read_general_fixed_width_data not available. Cannot load MAINS data.")
            return name_to_id

        # Filled incrementally so that records read before a mid-file failure are kept
        for record, line_number in MediLink_DataMgmt.read_general_fixed_width_data(mains_path, mains_slices):
            # Assuming line_number gives the correct insurance ID without needing adjustment
            name_to_id[record['MAINSNAME']] = line_number

        if hasattr(MediLink_ConfigLoader, 'log'):
            MediLink_ConfigLoader.log("Successfully loaded {} insurance records from MAINS".format(len(name_to_id)), level="INFO")
        else:
            print("Successfully loaded {} insurance records from MAINS".format(len(name_to_id)))

    except FileNotFoundError:
        _display_mains_file_error(mains_path)
    except Exception as e:
        error_msg = "Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e))
        if hasattr(MediLink_ConfigLoader, 'log'):
            MediLink_ConfigLoader.log(error_msg, level="ERROR")
        print("Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e)))

    return name_to_id
1671
+
1672
def load_insurance_data_from_mapat(config, crosswalk):
    """
    Read the MAPAT fixed-width file and map each patient ID to its insurance ID.

    Args:
        config (dict): Configuration object. Not read by this function; the MAPAT path comes from
            the app-control object returned by _ac().
        crosswalk (dict): Crosswalk providing the field slices under ['mapat_mapping']['slices'].

    Returns:
        dict: A dictionary mapping patient IDs (MAPATPXID) to insurance IDs (MAPATINID). When a
        patient ID occurs more than once the last record wins.
    """
    # MAPAT location comes from app control; an empty path is used when it is unavailable
    app_ctl = _ac()
    mapat_path = app_ctl.get_mapat_med_path() if app_ctl else ''
    field_slices = crosswalk['mapat_mapping']['slices']

    insurance_by_patient = {}

    # The reader yields (record, line_number); the line number is not needed here
    for mapat_record, _ in MediLink_DataMgmt.read_general_fixed_width_data(mapat_path, field_slices):
        pxid = mapat_record['MAPATPXID']
        inid = mapat_record['MAPATINID']
        insurance_by_patient[pxid] = inid

    return insurance_by_patient
1698
+
1699
def parse_z_dat(z_dat_path, config):  # Why is this in MediBot and not MediLink?
    """
    Map Patient IDs to Insurance Names by parsing the fixed-width Z.dat file.

    Records are read as five-line blocks and parsed with the 'MediLink_Config' section of
    `config` (or `config` itself when that key is absent). Records missing either the patient ID
    or the insurance name are ignored. Read/parse failures are logged and whatever was collected
    up to that point is returned.

    Args:
        z_dat_path (str): Path to the Z.dat file.
        config (dict): Configuration object containing slicing information and other parameters.

    Returns:
        dict: A dictionary mapping Patient IDs to Insurance Names.
    """
    insurance_name_by_patient = {}

    try:
        # Z.dat reserved record format: 3 active + 2 reserved lines per record
        for line_1, line_2, line_3, line_4, line_5 in MediLink_DataMgmt.read_fixed_width_data(z_dat_path):
            record = MediLink_DataMgmt.parse_fixed_width_data(
                line_1, line_2, line_3, line_4, line_5, config.get('MediLink_Config', config))

            patient_id = record.get('PATID')
            insurance_name = record.get('INAME')

            # Both values are required for a usable mapping entry
            if not (patient_id and insurance_name):
                continue

            insurance_name_by_patient[patient_id] = insurance_name
            MediLink_ConfigLoader.log("Mapped Patient ID {} to Insurance Name {}".format(patient_id, insurance_name), config, level="INFO")

    except FileNotFoundError:
        MediLink_ConfigLoader.log("File not found: {}".format(z_dat_path), config, level="INFO")
    except Exception as e:
        MediLink_ConfigLoader.log("Failed to parse Z.dat: {}".format(str(e)), config, level="INFO")

    return insurance_name_by_patient
1732
+
1733
def load_historical_payer_to_patient_mappings(config):
    """
    Loads historical mappings from multiple Carol's CSV files in a specified directory,
    mapping Payer IDs to sets of Patient IDs.

    Every file ending in '.csv' in the directory that holds config['CSV_FILE_PATH'] is
    read. Rows lacking a non-blank 'Patient ID' or 'Ins1 Payer ID' are skipped. A failure
    in one file is printed and logged, and processing continues with the next file.

    Args:
        config (dict): Configuration object containing the directory path for Carol's CSV files
                       and other necessary parameters.

    Returns:
        dict: A dictionary where each key is a Payer ID and the value is a set of Patient IDs.
              Empty when the directory is missing or no valid mappings are found.
    """
    directory_path = os.path.dirname(config['CSV_FILE_PATH'])
    payer_to_patient_ids = defaultdict(set)

    try:
        # Check if the directory exists
        if not os.path.isdir(directory_path):
            raise FileNotFoundError("Directory '{}' not found.".format(directory_path))

        # Loop through each file in the directory containing Carol's historical CSVs
        for filename in os.listdir(directory_path):
            if not filename.endswith('.csv'):
                continue
            file_path = os.path.join(directory_path, filename)
            try:
                patient_count = 0  # Counter for Patient IDs found in this CSV
                # 'utf-8-sig' drops a leading BOM so the first header is not read as
                # '\ufeffPatient ID'; newline='' is what the csv module expects.
                with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
                    for row in csv.DictReader(csvfile):
                        # DictReader fills missing trailing columns with None; normalise
                        # to '' so a short row is skipped instead of raising and
                        # discarding the rest of the file.
                        payer_id = (row.get('Ins1 Payer ID') or '').strip()
                        patient_id = (row.get('Patient ID') or '').strip()
                        if not payer_id or not patient_id:
                            continue  # Skip rows where either value is missing or empty

                        payer_to_patient_ids[payer_id].add(patient_id)
                        patient_count += 1  # Increment the counter for each valid mapping

                # Log the accumulated count for this CSV file
                if patient_count > 0:
                    MediLink_ConfigLoader.log("CSV file '{}' has {} Patient IDs with Payer IDs.".format(filename, patient_count), level="DEBUG")
                else:
                    MediLink_ConfigLoader.log("CSV file '{}' is empty or does not have valid Patient ID or Payer ID mappings.".format(filename), level="DEBUG")
            except Exception as e:
                # Best effort: one unreadable file must not stop the remaining files.
                print("Error processing file {}: {}".format(filename, e))
                MediLink_ConfigLoader.log("Error processing file '{}': {}".format(filename, e), level="ERROR")
    except FileNotFoundError as e:
        print("Error: {}".format(e))

    if not payer_to_patient_ids:
        print("No historical mappings were generated.")

    # Return a plain dict so callers do not create keys by accident on lookup.
    return dict(payer_to_patient_ids)
1787
+
1788
def capitalize_all_fields(csv_data):
    """
    Uppercase the textual content of every row in place.

    Per-field rules, applied in this order:
      * keys that are strings starting with '_' (internal/derived fields) keep their value;
      * string values are uppercased;
      * list values get a new list whose string elements are uppercased;
      * dict values get a new dict whose string values are uppercased (keys untouched);
      * None and datetime values are kept as they are;
      * anything else is converted with str() and uppercased.

    Parameters:
        csv_data (list of dict): The CSV data where each row is represented as a dictionary.

    Returns:
        None: The rows of csv_data are modified in place.
    """
    def _upper_if_text(item):
        # Only plain strings are uppercased; every other element is passed through.
        return item.upper() if isinstance(item, str) else item

    for record in csv_data:
        converted = {}
        for field, content in record.items():
            if isinstance(field, str) and field.startswith('_'):
                # Internal/derived fields (e.g. `_all_surgery_dates`) stay intact.
                converted[field] = content
            elif isinstance(content, str):
                converted[field] = content.upper()
            elif isinstance(content, list):
                converted[field] = [_upper_if_text(item) for item in content]
            elif isinstance(content, dict):
                converted[field] = {k: _upper_if_text(v) for k, v in content.items()}
            elif content is None or isinstance(content, datetime):
                converted[field] = content
            else:
                # Remaining scalars are coerced to uppercase text for consistency.
                converted[field] = str(content).upper()
        # Apply all replacements after the scan so the row is never mutated while iterated.
        record.update(converted)