medicafe 0.251017.1__py3-none-any.whl → 0.251026.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of medicafe might be problematic.

@@ -1,1822 +1,1896 @@
1
- # MediBot_Preprocessor_lib.py
2
- """
3
- Core preprocessing library for MediBot
4
- Contains core preprocessing functions and utilities.
5
- """
6
-
7
- import csv, time, os, sys
8
- from datetime import datetime, timedelta
9
- from collections import OrderedDict
10
-
11
- # Try to import chardet for encoding detection
12
- try:
13
- import chardet
14
- except ImportError:
15
- chardet = None # Fallback if chardet is not available
16
-
17
- # SORTING STRATEGY CONFIGURATION
18
- # Set to 'schedule_based' to enable surgery schedule sorting
19
- # Set to 'date_based' to use current date-based sorting (default)
20
- SORTING_STRATEGY = 'date_based' # Hard-coded with clear comments
21
-
22
- # When enabled, patients will be sorted based on their position in the DOCX surgery schedule
23
- # When disabled, patients will be sorted by earliest surgery date (current behavior)
24
-
25
- # Use core utilities for standardized imports
26
- from MediCafe.core_utils import (
27
- import_medibot_module,
28
- import_medilink_module,
29
- get_config_loader_with_fallback
30
- )
31
-
32
- # Initialize configuration loader with fallback
33
- MediLink_ConfigLoader = get_config_loader_with_fallback()
34
-
35
- # Import MediLink_DataMgmt using centralized import function
36
- MediLink_DataMgmt = import_medilink_module('MediLink_DataMgmt')
37
-
38
- # Import MediBot modules using centralized import functions
39
- MediBot_UI = import_medibot_module('MediBot_UI')
40
- if MediBot_UI:
41
- app_control = getattr(MediBot_UI, 'app_control', None)
42
- get_app_control = getattr(MediBot_UI, '_get_app_control', None)
43
- def _ac():
44
- try:
45
- return get_app_control() if get_app_control else getattr(MediBot_UI, 'app_control', None)
46
- except Exception:
47
- return getattr(MediBot_UI, 'app_control', None)
48
- else:
49
- app_control = None
50
-
51
- MediBot_docx_decoder = import_medibot_module('MediBot_docx_decoder')
52
- if MediBot_docx_decoder:
53
- parse_docx = getattr(MediBot_docx_decoder, 'parse_docx', None)
54
- else:
55
- parse_docx = None
56
-
57
- # Add the parent directory of the project to the Python path
58
- sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
59
-
60
- # Configuration cache to avoid repeated loading
61
- _config_cache = None
62
- _crosswalk_cache = None
63
-
64
- # Use core utilities for standardized imports
65
- from MediCafe.core_utils import get_shared_config_loader
66
- MediLink_ConfigLoader = get_shared_config_loader()
67
-
68
- # Ensure MediLink_ConfigLoader is available
69
- if MediLink_ConfigLoader is None:
70
- print("Warning: MediLink_ConfigLoader not available. Some functionality may be limited.")
71
- # Create a minimal fallback logger
72
- class FallbackLogger:
73
- def log(self, message, level="INFO"):
74
- print("[{}] {}".format(level, message))
75
- MediLink_ConfigLoader = FallbackLogger()
76
-
77
- # Import centralized logging configuration
78
- try:
79
- from MediCafe.logging_config import PERFORMANCE_LOGGING
80
- except ImportError:
81
- # Fallback to local flag if centralized config is not available
82
- PERFORMANCE_LOGGING = False
83
-
84
- # XP Compatibility: Add robust fallback for configuration loading
85
- def get_cached_configuration_xp_safe():
86
- """
87
- XP-compatible version of get_cached_configuration with robust fallbacks.
88
- """
89
- global _config_cache, _crosswalk_cache
90
-
91
- # If we already have cached data, return it
92
- if _config_cache is not None and _crosswalk_cache is not None:
93
- return _config_cache, _crosswalk_cache
94
-
95
- # Try to load configuration using the standard method
96
- try:
97
- if MediLink_ConfigLoader and hasattr(MediLink_ConfigLoader, 'load_configuration'):
98
- _config_cache, _crosswalk_cache = MediLink_ConfigLoader.load_configuration()
99
- return _config_cache, _crosswalk_cache
100
- except Exception as e:
101
- print("Warning: Failed to load configuration via MediLink_ConfigLoader: {}".format(e))
102
-
103
- # Fallback: Try to load configuration files directly
104
- try:
105
- import json
106
- project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
107
-
108
- # Try to load config.json
109
- config_path = os.path.join(project_dir, 'json', 'config.json')
110
- if os.path.exists(config_path):
111
- with open(config_path, 'r') as f:
112
- _config_cache = json.load(f)
113
- else:
114
- _config_cache = {}
115
-
116
- # Try to load crosswalk.json
117
- crosswalk_path = os.path.join(project_dir, 'json', 'crosswalk.json')
118
- if os.path.exists(crosswalk_path):
119
- with open(crosswalk_path, 'r') as f:
120
- _crosswalk_cache = json.load(f)
121
- else:
122
- _crosswalk_cache = {}
123
-
124
- return _config_cache, _crosswalk_cache
125
-
126
- except Exception as e:
127
- print("Warning: Failed to load configuration files directly: {}".format(e))
128
- # Return empty defaults
129
- _config_cache = {}
130
- _crosswalk_cache = {}
131
- return _config_cache, _crosswalk_cache
132
-
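The fallback above is essentially module-level memoization of two JSON files. A minimal standalone sketch of the same pattern, assuming a hypothetical file name and helper name:

import json

_config_cache = None

def load_config_cached(path='json/config.json'):
    """Load a JSON config once and reuse the parsed dict on later calls."""
    global _config_cache
    if _config_cache is not None:
        return _config_cache
    try:
        with open(path, 'r') as f:
            _config_cache = json.load(f)
    except (IOError, ValueError):
        # Missing or malformed file: degrade to an empty config
        _config_cache = {}
    return _config_cache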
133
- class InitializationError(Exception):
134
- def __init__(self, message):
135
- self.message = message
136
- super().__init__(self.message)
137
-
138
- def initialize(config):
139
- global AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping, page_end_markers
140
-
141
- required_keys = {
142
- 'AHK_EXECUTABLE': "",
143
- 'CSV_FILE_PATH': "",
144
- 'field_mapping': {},
145
- 'page_end_markers': []
146
- }
147
-
148
- for key, default in required_keys.items():
149
- try:
150
- globals()[key] = config.get(key, default) if key != 'field_mapping' else OrderedDict(config.get(key, default))
151
- except AttributeError:
152
- raise InitializationError("Error: '{}' not found in config.".format(key))
153
-
154
- def get_cached_configuration():
155
- """
156
- Returns cached configuration and crosswalk data to avoid repeated I/O operations.
157
- """
158
- return get_cached_configuration_xp_safe()
159
-
160
- def open_csv_for_editing(csv_file_path):
161
- try:
162
- # Open the CSV file with its associated application
163
- os.system('start "" "{}"'.format(csv_file_path))
164
- print("After saving the revised CSV, please re-run MediBot.")
165
- except Exception as e:
166
- print("Failed to open CSV file:", e)
167
-
168
- # Function to clean the headers
169
- def clean_header(headers):
170
- """
171
- Cleans the header strings by removing unwanted characters and trimming whitespace.
172
-
173
- Parameters:
174
- headers (list of str): The original header strings.
175
-
176
- Returns:
177
- list of str: The cleaned header strings.
178
- """
179
- cleaned_headers = []
180
-
181
- for header in headers:
182
- # Strip leading and trailing whitespace
183
- cleaned_header = header.strip()
184
- # Remove unwanted characters while keeping spaces, alphanumeric characters, hyphens, and underscores
185
- cleaned_header = ''.join(char for char in cleaned_header if char.isalnum() or char.isspace() or char in ['-', '_'])
186
- cleaned_headers.append(cleaned_header)
187
-
188
- # Log the original and cleaned headers for debugging
189
- MediLink_ConfigLoader.log("Original headers: {}".format(headers), level="INFO")
190
- MediLink_ConfigLoader.log("Cleaned headers: {}".format(cleaned_headers), level="INFO")
191
-
192
- # Check if 'Surgery Date' is in the cleaned headers
193
- if 'Surgery Date' not in cleaned_headers:
194
- MediLink_ConfigLoader.log("WARNING: 'Surgery Date' header not found after cleaning.", level="WARNING")
195
- print("WARNING: 'Surgery Date' header not found after cleaning.")
196
- raise ValueError("Error: 'Surgery Date' header not found after cleaning.")
197
-
198
- return cleaned_headers
199
-
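A hypothetical before/after illustration of the cleaning rules (quotes and '#' are stripped; hyphens, underscores, and inner spaces survive):

headers = ['  Surgery Date ', 'Patient_ID#', '"Primary Insurance"']
cleaned = clean_header(headers)
# -> ['Surgery Date', 'Patient_ID', 'Primary Insurance']
# clean_header raises ValueError if 'Surgery Date' is absent after cleaning.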
200
- # Function to load and process CSV data
201
- def load_csv_data(csv_file_path):
202
- try:
203
- # Check if the file exists
204
- if not os.path.exists(csv_file_path):
205
- raise FileNotFoundError("***Error: CSV file '{}' not found.".format(csv_file_path))
206
-
207
- # Detect the file encoding
208
- with open(csv_file_path, 'rb') as f:
209
- raw_data = f.read()
210
- if chardet:
211
- result = chardet.detect(raw_data)
212
- encoding = result['encoding']
213
- confidence = result['confidence']
214
- else:
215
- # Fallback to UTF-8 when chardet is not available
216
- encoding = 'utf-8'
217
- confidence = 1.0
218
- print("Detected encoding: {} (Confidence: {:.2f})".format(encoding, confidence))
219
-
220
- # Read the CSV file with the detected encoding
221
- with open(csv_file_path, 'r', encoding=encoding) as csvfile:
222
- reader = csv.DictReader(csvfile)
223
- # Clean the headers
224
- cleaned_headers = clean_header(reader.fieldnames)
225
-
226
- # PERFORMANCE FIX: Use zip() instead of range(len()) for header mapping
227
- header_mapping = {clean: orig for clean, orig in zip(cleaned_headers, reader.fieldnames)}
228
-
229
- # Process the remaining rows - optimize by pre-allocating the list
230
- csv_data = []
231
- # Pre-allocate list size if we can estimate it (optional optimization)
232
- # csv_data = [None] * estimated_size # if we had row count
233
-
234
- for row in reader:
235
- # PERFORMANCE FIX: Use zip() instead of range(len()) for row processing
236
- cleaned_row = {clean: row[header_mapping[clean]] for clean in cleaned_headers}
237
- csv_data.append(cleaned_row)
238
-
239
- return csv_data # Return a list of dictionaries
240
- except FileNotFoundError as e:
241
- print(e) # Print the informative error message
242
- print("Hint: Check if CSV file is located in the expected directory or specify a different path in config file.")
243
- print("Please correct the issue and re-run MediBot.")
244
- sys.exit(1) # Halt the script
245
- except IOError as e:
246
- print("Error reading CSV file: {}. Please check the file path and permissions.".format(e))
247
- sys.exit(1) # Halt the script in case of other IO errors
248
-
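The chardet fallback can be isolated into a small helper; a sketch under the same import guard used at the top of the module (helper name hypothetical):

def sniff_encoding(path, default='utf-8'):
    """Guess a file's encoding with chardet when available, else use a default."""
    with open(path, 'rb') as f:
        raw = f.read()
    if chardet:
        guess = chardet.detect(raw)
        # chardet can return None for the encoding on inconclusive input
        return guess.get('encoding') or default
    return default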
249
- # CSV Pre-processor Helper functions
250
- def add_columns(csv_data, column_headers):
251
- """
252
- Adds one or multiple columns to the CSV data.
253
-
254
- Parameters:
255
- csv_data (list of dict): The CSV data where each row is represented as a dictionary.
256
- column_headers (list of str or str): A list of column headers to be added to each row, or a single column header.
257
-
258
- Returns:
259
- None: The function modifies the csv_data in place.
260
- """
261
- if isinstance(column_headers, str):
262
- column_headers = [column_headers]
263
- elif not isinstance(column_headers, list):
264
- raise ValueError("column_headers should be a list or a string")
265
-
266
- # PERFORMANCE FIX: Optimize column initialization to avoid nested loop
267
- for row in csv_data:
268
- # Use dict.update() to set multiple columns at once
269
- row.update({header: '' for header in column_headers})
270
-
271
- # Extracting the list to a variable for future refactoring:
272
- def filter_rows(csv_data):
273
- # TODO: This should be written in the crosswalk and not hardcoded here.
274
- excluded_insurance = {'AETNA', 'AETNA MEDICARE', 'HUMANA MED HMO'}
275
- csv_data[:] = [row for row in csv_data if row.get('Patient ID') and row.get('Primary Insurance') not in excluded_insurance]
276
-
277
- def detect_date_format(date_str):
278
- """
279
- PERFORMANCE OPTIMIZATION: Quickly detect the most likely date format
280
- to avoid trying all formats for every date string.
281
-
282
- Parameters:
283
- - date_str (str): The date string to analyze
284
-
285
- Returns:
286
- - str: The most likely format string, or None if unclear
287
- """
288
- if not date_str:
289
- return None
290
-
291
- # Remove time components if present
292
- date_only = date_str.split()[0]
293
-
294
- # Count separators to guess format
295
- slash_count = date_only.count('/')
296
- dash_count = date_only.count('-')
297
-
298
- # Check for a leading 4-digit year (YYYY-MM-DD or YYYY/MM/DD); a bare length check would misclassify MM/DD/YYYY, which is also 10 characters long
299
- if len(date_only.replace('-', '/').split('/')[0]) == 4:
300
- if dash_count == 2:
301
- return '%Y-%m-%d'
302
- elif slash_count == 2:
303
- return '%Y/%m/%d'
304
-
305
- # Check for 2-digit year (likely MM/DD/YY or MM-DD-YY)
306
- if len(date_only) == 8: # exactly MM/DD/YY or MM-DD-YY; longer strings fall through to the defaults below
307
- if dash_count == 2:
308
- return '%m-%d-%y'
309
- elif slash_count == 2:
310
- return '%m/%d/%y'
311
-
312
- # Default to most common format (MM/DD/YYYY)
313
- if dash_count == 2:
314
- return '%m-%d-%Y'
315
- elif slash_count == 2:
316
- return '%m/%d/%Y'
317
-
318
- return None
319
-
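With the first-component check above, the detector resolves the common shapes directly; for example (sample dates hypothetical):

for s in ['2024-03-07', '03/07/2024', '03-07-24']:
    print(s, '->', detect_date_format(s))
# 2024-03-07 -> %Y-%m-%d
# 03/07/2024 -> %m/%d/%Y
# 03-07-24   -> %m-%d-%y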
320
- class OptimizedDate:
321
- """
322
- Optimized date object that pre-computes all common format variations
323
- to avoid redundant datetime conversions throughout the application.
324
- """
325
- def __init__(self, datetime_obj):
326
- self.datetime = datetime_obj
327
- # Pre-compute all common format variations
328
- self._display_short = datetime_obj.strftime('%m-%d') # For table display
329
- self._display_full = datetime_obj.strftime('%m-%d-%Y') # Full format
330
- self._medisoft_format = datetime_obj.strftime('%m%d%Y') # For Medisoft entry
331
- self._iso_format = datetime_obj.strftime('%Y-%m-%d') # For sorting/comparison
332
-
333
- @property
334
- def display_short(self):
335
- """Short display format: MM-DD"""
336
- return self._display_short
337
-
338
- @property
339
- def display_full(self):
340
- """Full display format: MM-DD-YYYY"""
341
- return self._display_full
342
-
343
- @property
344
- def medisoft_format(self):
345
- """Medisoft entry format: MMDDYYYY"""
346
- return self._medisoft_format
347
-
348
- @property
349
- def iso_format(self):
350
- """ISO format for sorting: YYYY-MM-DD"""
351
- return self._iso_format
352
-
353
- def __str__(self):
354
- return self._display_full
355
-
356
- def __repr__(self):
357
- return "OptimizedDate({})".format(self._display_full)
358
-
359
- def __eq__(self, other):
360
- if isinstance(other, OptimizedDate):
361
- return self.datetime == other.datetime
362
- elif hasattr(other, 'strftime'): # datetime object
363
- return self.datetime == other
364
- return False
365
-
366
- def __lt__(self, other):
367
- if isinstance(other, OptimizedDate):
368
- return self.datetime < other.datetime
369
- elif hasattr(other, 'strftime'): # datetime object
370
- return self.datetime < other
371
- return NotImplemented
372
-
373
- def __gt__(self, other):
374
- if isinstance(other, OptimizedDate):
375
- return self.datetime > other.datetime
376
- elif hasattr(other, 'strftime'): # datetime object
377
- return self.datetime > other
378
- return NotImplemented
379
-
380
- def strftime(self, format_str):
381
- """Fallback for any custom format needs"""
382
- return self.datetime.strftime(format_str)
383
-
384
- @classmethod
385
- def from_string(cls, date_str, cleaned=False):
386
- """
387
- Create OptimizedDate from string, with optional pre-cleaning.
388
-
389
- Args:
390
- date_str: Date string to parse
391
- cleaned: If True, assumes string is already cleaned
392
-
393
- Returns:
394
- OptimizedDate object or None if parsing fails
395
- """
396
- if not cleaned:
397
- date_str = clean_surgery_date_string(date_str)
398
- if not date_str:
399
- return None
400
-
401
- # Try standard format first (most common)
402
- try:
403
- return cls(datetime.strptime(date_str, '%m/%d/%Y'))
404
- except ValueError:
405
- pass
406
-
407
- # Try alternative formats
408
- formats = ['%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%Y/%m/%d', '%Y-%m-%d']
409
- for fmt in formats:
410
- try:
411
- return cls(datetime.strptime(date_str, fmt))
412
- except ValueError:
413
- continue
414
-
415
- return None
416
-
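A typical round-trip through the precomputed formats (date value hypothetical):

d = OptimizedDate.from_string('3/7/2024')
if d is not None:
    print(d.display_short)    # 03-07
    print(d.medisoft_format)  # 03072024
    print(d.iso_format)       # 2024-03-07
    print(d < datetime(2024, 4, 1))  # True; comparisons fall through to datetime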
417
- def clean_surgery_date_string(date_str):
418
- """
419
- Cleans and normalizes surgery date strings to handle damaged data.
420
-
421
- Parameters:
422
- - date_str (str): The raw date string from the CSV
423
-
424
- Returns:
425
- - str: Cleaned date string in MM/DD/YYYY format, or empty string if unparseable
426
- """
427
- if not date_str:
428
- return ''
429
-
430
- # Convert to string and strip whitespace
431
- date_str = str(date_str).strip()
432
- if not date_str:
433
- return ''
434
-
435
- # Remove common problematic characters and normalize
436
- date_str = date_str.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
437
- date_str = ' '.join(date_str.split()) # Normalize whitespace
438
-
439
- # PERFORMANCE OPTIMIZATION: Try detected format first
440
- detected_format = detect_date_format(date_str)
441
- if detected_format:
442
- try:
443
- parsed_date = datetime.strptime(date_str, detected_format)
444
- return parsed_date.strftime('%m/%d/%Y')
445
- except ValueError:
446
- pass
447
-
448
- # PERFORMANCE OPTIMIZATION: Try most common format first (MM/DD/YYYY)
449
- # This reduces the average number of format attempts from 8 to ~1-2
450
- try:
451
- parsed_date = datetime.strptime(date_str, '%m/%d/%Y')
452
- return parsed_date.strftime('%m/%d/%Y')
453
- except ValueError:
454
- pass
455
-
456
- # PERFORMANCE OPTIMIZATION: Try second most common format (MM-DD-YYYY)
457
- try:
458
- parsed_date = datetime.strptime(date_str, '%m-%d-%Y')
459
- return parsed_date.strftime('%m/%d/%Y')
460
- except ValueError:
461
- pass
462
-
463
- # PERFORMANCE OPTIMIZATION: Try 2-digit year formats only if needed
464
- try:
465
- parsed_date = datetime.strptime(date_str, '%m/%d/%y')
466
- return parsed_date.strftime('%m/%d/%Y')
467
- except ValueError:
468
- pass
469
-
470
- try:
471
- parsed_date = datetime.strptime(date_str, '%m-%d-%y')
472
- return parsed_date.strftime('%m/%d/%Y')
473
- except ValueError:
474
- pass
475
-
476
- # PERFORMANCE OPTIMIZATION: Try YYYY formats only if needed
477
- try:
478
- parsed_date = datetime.strptime(date_str, '%Y/%m/%d')
479
- return parsed_date.strftime('%m/%d/%Y')
480
- except ValueError:
481
- pass
482
-
483
- try:
484
- parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
485
- return parsed_date.strftime('%m/%d/%Y')
486
- except ValueError:
487
- pass
488
-
489
- # PERFORMANCE OPTIMIZATION: Try datetime formats only if needed
490
- try:
491
- parsed_date = datetime.strptime(date_str, '%m/%d/%Y %H:%M:%S')
492
- return parsed_date.strftime('%m/%d/%Y')
493
- except ValueError:
494
- pass
495
-
496
- try:
497
- parsed_date = datetime.strptime(date_str, '%m-%d-%Y %H:%M:%S')
498
- return parsed_date.strftime('%m/%d/%Y')
499
- except ValueError:
500
- pass
501
-
502
- # If no format matches, try to extract date components
503
- try:
504
- # Remove any time components and extra text
505
- date_only = date_str.split()[0] # Take first part if there's extra text
506
-
507
- # Try to extract numeric components
508
- import re
509
- numbers = re.findall(r'\d+', date_only)
510
-
511
- if len(numbers) >= 3:
512
- # Assume MM/DD/YYYY or MM-DD-YYYY format
513
- month, day, year = int(numbers[0]), int(numbers[1]), int(numbers[2])
514
-
515
- # Handle 2-digit years first; a 2-digit year would otherwise fail the range check below
516
- if year < 100:
517
- year += 2000 if year < 50 else 1900
518
- # Validate ranges
519
- if 1 <= month <= 12 and 1 <= day <= 31 and 1900 <= year <= 2100:
520
-
521
- parsed_date = datetime(year, month, day)
522
- return parsed_date.strftime('%m/%d/%Y')
523
- except (ValueError, IndexError):
524
- pass
525
-
526
- # If all parsing attempts fail, return empty string
527
- return ''
528
-
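The net effect is a many-to-one normalization into MM/DD/YYYY; a few representative cases, assuming the module is imported (sample values hypothetical):

samples = {
    '03/07/2024':          '03/07/2024',
    '3-7-2024':            '03/07/2024',
    '2024-03-07':          '03/07/2024',
    '03/07/2024 08:30:00': '03/07/2024',
    'not a date':          '',
}
for raw, expected in samples.items():
    assert clean_surgery_date_string(raw) == expected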
529
- def convert_surgery_date(csv_data):
530
- """
531
- Converts surgery date strings to datetime objects with comprehensive data cleaning.
532
-
533
- Parameters:
534
- - csv_data (list): List of dictionaries containing CSV row data
535
- """
536
- # TIMING: Start surgery date conversion with granular tracking
537
- total_start_time = time.time()
538
- date_cleaning_time = 0
539
- date_parsing_time = 0
540
- processed_count = 0
541
- empty_count = 0
542
- error_count = 0
543
-
544
- print("Starting surgery date conversion for {} rows...".format(len(csv_data)))
545
- # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes
546
- # MediLink_ConfigLoader.log("Starting surgery date conversion for {} rows...".format(len(csv_data)), level="INFO") # REMOVED
547
-
548
- # PERFORMANCE OPTIMIZATION: Pre-compile datetime.strptime for the most common format
549
- # This avoids repeated format string parsing
550
- standard_format = '%m/%d/%Y'
551
-
552
- for row_idx, row in enumerate(csv_data, 1):
553
- surgery_date_str = row.get('Surgery Date', '')
554
-
555
- if not surgery_date_str:
556
- empty_count += 1
557
- # LOGGING STRATEGY: Only log actual errors/failures, not routine empty dates
558
- # if empty_count <= 5: # Only log first 5 empty dates
559
- # MediLink_ConfigLoader.log("Warning: Surgery Date not found for row: {}".format(row), level="WARNING")
560
- # print("Surgery Date not found for row: {}".format(row))
561
- row['Surgery Date'] = datetime.min # Assign a minimum datetime value if empty
562
- else:
563
- # TIMING: Start date string cleaning
564
- cleaning_start = time.time()
565
-
566
- # Clean the date string first
567
- cleaned_date_str = clean_surgery_date_string(surgery_date_str)
568
-
569
- # TIMING: End date string cleaning
570
- cleaning_end = time.time()
571
- date_cleaning_time += (cleaning_end - cleaning_start)
572
-
573
- if not cleaned_date_str:
574
- error_count += 1
575
- # LOGGING STRATEGY: Log actual errors (cleaning failures) at INFO level
576
- if error_count <= 5: # Only log first 5 errors
577
- MediLink_ConfigLoader.log("Error: Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row), level="INFO")
578
- print("Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row))
579
- row['Surgery Date'] = datetime.min # Assign a minimum datetime value if cleaning fails
580
- else:
581
- # TIMING: Start date parsing
582
- parsing_start = time.time()
583
-
584
- try:
585
- # PERFORMANCE OPTIMIZATION: Use pre-compiled format string
586
- # Parse the cleaned date string
587
- row['Surgery Date'] = datetime.strptime(cleaned_date_str, standard_format)
588
- processed_count += 1
589
- # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
590
- # if processed_count <= 10 or processed_count % 100 == 0: # Log first 10 and every 100th
591
- # MediLink_ConfigLoader.log("Successfully cleaned and parsed Surgery Date '{}' -> '{}' for row: {}".format(
592
- # surgery_date_str, cleaned_date_str, row), level="DEBUG")
593
- except ValueError as e:
594
- error_count += 1
595
- # LOGGING STRATEGY: Log actual errors (parsing failures) at INFO level
596
- if error_count <= 5: # Only log first 5 parsing errors
597
- MediLink_ConfigLoader.log("Error parsing cleaned Surgery Date '{}': {} for row: {}".format(
598
- cleaned_date_str, e, row), level="INFO")
599
- row['Surgery Date'] = datetime.min # Assign a minimum datetime value if parsing fails
600
-
601
- # TIMING: End date parsing
602
- parsing_end = time.time()
603
- date_parsing_time += (parsing_end - parsing_start)
604
-
605
- # TIMING: End total surgery date conversion
606
- total_end_time = time.time()
607
- total_duration = total_end_time - total_start_time
608
-
609
- if PERFORMANCE_LOGGING:
610
- print("Surgery date conversion completed:")
611
- print(" - Total duration: {:.2f} seconds".format(total_duration))
612
- print(" - Date cleaning time: {:.2f} seconds ({:.1f}%)".format(date_cleaning_time, (date_cleaning_time/total_duration)*100))
613
- print(" - Date parsing time: {:.2f} seconds ({:.1f}%)".format(date_parsing_time, (date_parsing_time/total_duration)*100))
614
- print(" - Processed: {} rows, Empty: {} rows, Errors: {} rows".format(processed_count, empty_count, error_count))
615
-
616
- # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
617
- MediLink_ConfigLoader.log("Surgery date conversion completed - Total: {:.2f}s, Cleaning: {:.2f}s, Parsing: {:.2f}s, Processed: {}, Empty: {}, Errors: {}".format(
618
- total_duration, date_cleaning_time, date_parsing_time, processed_count, empty_count, error_count), level="INFO")
619
-
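End-to-end, the conversion turns strings into datetime objects and marks blanks with the datetime.min sentinel; for example, with two-digit years widened as above (rows hypothetical):

rows = [{'Surgery Date': '3/7/24'}, {'Surgery Date': ''}]
convert_surgery_date(rows)
print(rows[0]['Surgery Date'])                  # 2024-03-07 00:00:00
print(rows[1]['Surgery Date'] is datetime.min)  # True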
620
- def _create_common_tie_breakers(row):
621
- """
622
- Creates common tie-breaker components used across multiple sorting strategies.
623
- This follows DRY principle by extracting shared logic.
624
- """
625
- last_name = (row.get('Patient Last') or '').strip().upper()
626
- first_name = (row.get('Patient First') or '').strip().upper()
627
- patient_id_tiebreak = str(row.get('Patient ID') or '')
628
- return (last_name, first_name, patient_id_tiebreak)
629
-
630
- def _normalize_surgery_date(row):
631
- """
632
- Normalizes surgery date for consistent sorting across strategies.
633
- """
634
- # Prefer earliest surgery date across all known dates for the patient
635
- earliest = row.get('_earliest_surgery_date')
636
- if isinstance(earliest, str) and earliest and earliest != 'MISSING':
637
- try:
638
- return datetime.strptime(earliest, '%m-%d-%Y')
639
- except Exception:
640
- pass
641
-
642
- # Fallback to the single Surgery Date field
643
- surgery_date = row.get('Surgery Date')
644
- if isinstance(surgery_date, datetime):
645
- return surgery_date
646
- elif isinstance(surgery_date, str) and surgery_date.strip():
647
- try:
648
- return datetime.strptime(surgery_date, '%m/%d/%Y')
649
- except ValueError:
650
- try:
651
- return datetime.strptime(surgery_date, '%m-%d-%Y')
652
- except ValueError:
653
- pass
654
-
655
- return datetime.min
656
-
657
- def _get_schedule_position(row):
658
- """
659
- Gets the schedule position for a patient from stored DOCX data.
660
- Returns a high number if no schedule data is available (puts at end).
661
- """
662
- schedule_positions = row.get('_schedule_positions', {})
663
- surgery_date = row.get('Surgery Date')
664
-
665
- # Convert surgery date to string format for lookup
666
- if isinstance(surgery_date, datetime):
667
- surgery_date_str = surgery_date.strftime('%m-%d-%Y')
668
- else:
669
- surgery_date_str = str(surgery_date)
670
-
671
- # Return schedule position if available, otherwise high number (end of list)
672
- return schedule_positions.get(surgery_date_str, 9999)
673
-
674
- def _get_surgery_date_string(row):
675
- """
676
- Gets surgery date as string for consistent sorting.
677
- """
678
- surgery_date = row.get('Surgery Date')
679
- if isinstance(surgery_date, datetime):
680
- return surgery_date.strftime('%m-%d-%Y')
681
- else:
682
- return str(surgery_date)
683
-
684
- def _create_date_based_sort_key(row):
685
- """
686
- Current date-based sorting logic (extracted from existing sort_key function).
687
- """
688
- normalized_date = _normalize_surgery_date(row)
689
- tie_breakers = _create_common_tie_breakers(row)
690
- return (normalized_date,) + tie_breakers
691
-
692
- def _create_schedule_based_sort_key(row):
693
- """
694
- Schedule-based sorting logic (new strategy).
695
- Uses patient position in DOCX surgery schedule as primary sort criterion.
696
- """
697
- schedule_position = _get_schedule_position(row)
698
- surgery_date_str = _get_surgery_date_string(row)
699
- tie_breakers = _create_common_tie_breakers(row)
700
- return (schedule_position, surgery_date_str) + tie_breakers
701
-
702
- def create_sort_key_strategy(strategy_type='date_based'):
703
- """
704
- Factory function that returns the appropriate sort key function.
705
- Follows existing strategy patterns in the codebase.
706
- """
707
- if strategy_type == 'schedule_based':
708
- return _create_schedule_based_sort_key
709
- else:
710
- return _create_date_based_sort_key
711
-
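Selecting and applying a strategy is then a one-liner; a toy example with the default date-based key (rows hypothetical):

rows = [
    {'Patient Last': 'SMITH', 'Patient First': 'ANN', 'Patient ID': '2',
     'Surgery Date': datetime(2024, 3, 8)},
    {'Patient Last': 'JONES', 'Patient First': 'BOB', 'Patient ID': '1',
     'Surgery Date': datetime(2024, 3, 7)},
]
rows.sort(key=create_sort_key_strategy('date_based'))
# JONES (03-07) now sorts ahead of SMITH (03-08)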
712
- def sort_and_deduplicate(csv_data):
713
- # Create a dictionary to hold unique patients based on Patient ID
714
- unique_patients = {}
715
- # Create a dictionary to store multiple surgery dates per patient
716
- patient_surgery_dates = {}
717
-
718
- # Iterate through the CSV data and populate the unique_patients dictionary
719
- for row in csv_data:
720
- patient_id = row.get('Patient ID')
721
- surgery_date = row.get('Surgery Date')
722
-
723
- if patient_id not in unique_patients:
724
- unique_patients[patient_id] = row
725
- patient_surgery_dates[patient_id] = [surgery_date]
726
- else:
727
- # If the patient ID already exists, compare surgery dates
728
- existing_row = unique_patients[patient_id]
729
- existing_date = existing_row['Surgery Date']
730
-
731
- # Ensure both dates are comparable by converting to datetime objects
732
- def normalize_date_for_comparison(date_value):
733
- if isinstance(date_value, datetime):
734
- return date_value
735
- elif isinstance(date_value, str) and date_value.strip():
736
- try:
737
- # Try to parse the string as a date
738
- return datetime.strptime(date_value, '%m/%d/%Y')
739
- except ValueError:
740
- try:
741
- return datetime.strptime(date_value, '%m-%d-%Y')
742
- except ValueError:
743
- # If parsing fails, return minimum datetime
744
- return datetime.min
745
- else:
746
- # Empty or invalid values get minimum datetime
747
- return datetime.min
748
-
749
- normalized_surgery_date = normalize_date_for_comparison(surgery_date)
750
- normalized_existing_date = normalize_date_for_comparison(existing_date)
751
-
752
- # Keep the most current demographic data (later surgery date takes precedence)
753
- if normalized_surgery_date > normalized_existing_date:
754
- # Store the old row's surgery date before replacing
755
- old_date = existing_row['Surgery Date']
756
- # Add the old date to the list if it's not already there
757
- if old_date not in patient_surgery_dates[patient_id]:
758
- patient_surgery_dates[patient_id].append(old_date)
759
- # Replace with newer row (better demographics)
760
- unique_patients[patient_id] = row
761
- # Add the new surgery date to the list if it's not already there
762
- if surgery_date not in patient_surgery_dates[patient_id]:
763
- patient_surgery_dates[patient_id].append(surgery_date)
764
- else:
765
- # Add this surgery date to the list for this patient if it's not already there
766
- if surgery_date not in patient_surgery_dates[patient_id]:
767
- patient_surgery_dates[patient_id].append(surgery_date)
768
-
769
- # Store the surgery dates information in the first row of each patient for later access
770
- for patient_id, row in unique_patients.items():
771
- # Convert surgery dates to strings for consistent storage
772
- surgery_date_strings = []
773
- for date in patient_surgery_dates[patient_id]:
774
- if isinstance(date, datetime):
775
- if date == datetime.min:
776
- surgery_date_strings.append('MISSING')
777
- else:
778
- surgery_date_strings.append(date.strftime('%m-%d-%Y'))
779
- else:
780
- surgery_date_strings.append(str(date) if date else 'MISSING')
781
-
782
- # Remove duplicates and sort
783
- unique_surgery_dates = list(set(surgery_date_strings))
784
- sorted_surgery_dates = sorted(unique_surgery_dates, key=lambda x: datetime.strptime(x, '%m-%d-%Y') if x != 'MISSING' else datetime.min)
785
- row['_all_surgery_dates'] = sorted_surgery_dates
786
- row['_primary_surgery_date'] = row['Surgery Date'] # Keep track of which date has the demographics
787
- # Compute and store earliest surgery date for emission sort
788
- earliest_dt = None
789
- earliest_str = None
790
- for d in sorted_surgery_dates:
791
- if d and d != 'MISSING':
792
- try:
793
- earliest_dt = datetime.strptime(d, '%m-%d-%Y')
794
- earliest_str = d
795
- break
796
- except Exception:
797
- pass
798
- # Fallback to demographics date if earliest could not be determined
799
- if earliest_str is None:
800
- try:
801
- sd = row.get('Surgery Date')
802
- if isinstance(sd, datetime) and sd != datetime.min:
803
- earliest_dt = sd
804
- earliest_str = sd.strftime('%m-%d-%Y')
805
- elif isinstance(sd, str) and sd.strip():
806
- try:
807
- earliest_dt = datetime.strptime(sd, '%m/%d/%Y')
808
- except Exception:
809
- try:
810
- earliest_dt = datetime.strptime(sd, '%m-%d-%Y')
811
- except Exception:
812
- earliest_dt = None
813
- earliest_str = sd
814
- except Exception:
815
- earliest_dt = None
816
- earliest_str = None
817
- row['_earliest_surgery_date'] = earliest_str
818
-
819
-
820
-
821
- # Convert the unique_patients dictionary back to a list and sort it
822
- # Use strategy pattern for sorting (follows existing codebase patterns)
823
- sort_key_func = create_sort_key_strategy(SORTING_STRATEGY)
824
-
825
- csv_data[:] = sorted(unique_patients.values(), key=sort_key_func)
826
-
827
- # TODO: Consider adding an option in the config to sort based on Surgery Schedules when available.
828
- # If no schedule is available, the current sorting strategy will be used.
829
- #
830
- # IMPLEMENTATION STATUS: Backend infrastructure is ready.
831
- # To enable surgery schedule sorting, set SORTING_STRATEGY = 'schedule_based' above.
832
- # The system will automatically fall back to date-based sorting if schedule data is unavailable.
833
-
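Behaviorally: one row survives per Patient ID, the surviving row carries the demographics of the latest surgery, and every observed date is preserved in _all_surgery_dates. A toy run (data hypothetical):

rows = [
    {'Patient ID': 'P1', 'Patient Last': 'A', 'Patient First': 'B',
     'Surgery Date': datetime(2024, 3, 7)},
    {'Patient ID': 'P1', 'Patient Last': 'A', 'Patient First': 'B',
     'Surgery Date': datetime(2024, 4, 2)},
]
sort_and_deduplicate(rows)
assert len(rows) == 1
assert rows[0]['_all_surgery_dates'] == ['03-07-2024', '04-02-2024']
assert rows[0]['_earliest_surgery_date'] == '03-07-2024'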
834
- def combine_fields(csv_data):
835
- for row in csv_data:
836
- # Safely handle the 'Surgery Date' conversion with clear missing indicator
837
- surgery_date = row.get('Surgery Date')
838
- try:
839
- if isinstance(surgery_date, datetime):
840
- if surgery_date == datetime.min:
841
- row['Surgery Date'] = 'MISSING'
842
- else:
843
- row['Surgery Date'] = surgery_date.strftime('%m-%d-%Y')
844
- elif surgery_date:
845
- # Already a non-empty string
846
- row['Surgery Date'] = str(surgery_date)
847
- else:
848
- row['Surgery Date'] = 'MISSING'
849
- except Exception:
850
- row['Surgery Date'] = 'MISSING'
851
-
852
- first_name = '_'.join(part.strip() for part in row.get('Patient First', '').split()) # Join the first name parts with underscores after cleaning.
853
- middle_name = row.get('Patient Middle', '').strip()
854
- middle_name = middle_name[0] if middle_name else '' # Take only the first character, or empty if there is no middle name
855
- last_name = '_'.join(part.strip() for part in row.get('Patient Last', '').split()) # Join the last name parts with underscores after cleaning.
856
- row['Patient Name'] = ', '.join(filter(None, [last_name, first_name])) + (' ' + middle_name if middle_name else '') # Comma between last and first, space before middle
857
-
858
- address1 = row.get('Patient Address1', '').strip()
859
- address2 = row.get('Patient Address2', '').strip()
860
- row['Patient Street'] = ' '.join(filter(None, [address1, address2])) # Join non-empty addresses
861
-
862
- def apply_replacements(csv_data, crosswalk):
863
- replacements = crosswalk.get('csv_replacements', {})
864
- # Pre-define the keys to check for better performance
865
- keys_to_check = ['Patient SSN', 'Primary Insurance', 'Ins1 Payer ID']
866
-
867
- for row in csv_data:
868
- # Use early termination - check each replacement only if needed
869
- for old_value, new_value in replacements.items():
870
- replacement_made = False
871
- for key in keys_to_check:
872
- if row.get(key) == old_value:
873
- row[key] = new_value
874
- replacement_made = True
875
- break # Exit the key loop once a replacement is made
876
- if replacement_made:
877
- break # Exit the replacement loop once any replacement is made
878
-
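The crosswalk drives this via a flat old-to-new map; note that both loops break after the first hit, so at most one replacement is applied per row (values hypothetical):

crosswalk = {'csv_replacements': {'MEDICARE FL': 'MEDICARE'}}
rows = [{'Patient SSN': '', 'Primary Insurance': 'MEDICARE FL', 'Ins1 Payer ID': ''}]
apply_replacements(rows, crosswalk)
assert rows[0]['Primary Insurance'] == 'MEDICARE'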
879
- import difflib
880
- from collections import defaultdict
881
-
882
- def find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names):
883
- """
884
- Finds the best matching Medisoft ID for a given insurance name using fuzzy matching.
885
-
886
- Parameters:
887
- - insurance_name (str): The insurance name from the CSV row.
888
- - medisoft_ids (list): List of Medisoft IDs associated with the Payer ID.
889
- - medisoft_to_mains_names (dict): Mapping from Medisoft ID to list of MAINS names.
890
-
891
- Returns:
892
- - int or None: The best matching Medisoft ID or None if no match is found.
893
- """
894
- best_match_ratio = 0
895
- best_medisoft_id = None
896
-
897
- # Pre-process insurance name once
898
- processed_insurance = ''.join(c for c in insurance_name if not c.isdigit()).upper()
899
-
900
- for medisoft_id in medisoft_ids:
901
- mains_names = medisoft_to_mains_names.get(medisoft_id, [])
902
- for mains_name in mains_names:
903
- # Preprocess names by extracting non-numeric characters and converting to uppercase
904
- # Use more efficient string processing
905
- processed_mains = ''.join(c for c in mains_name if not c.isdigit()).upper()
906
-
907
- # Log the processed names before computing the match ratio
908
- MediLink_ConfigLoader.log("Processing Medisoft ID '{}': Comparing processed insurance '{}' with processed mains '{}'.".format(medisoft_id, processed_insurance, processed_mains), level="DEBUG")
909
-
910
- # Compute the similarity ratio
911
- match_ratio = difflib.SequenceMatcher(None, processed_insurance, processed_mains).ratio()
912
-
913
- # Log the match ratio
914
- MediLink_ConfigLoader.log("Match ratio for Medisoft ID '{}': {:.2f}".format(medisoft_id, match_ratio), level="DEBUG")
915
-
916
- if match_ratio > best_match_ratio:
917
- best_match_ratio = match_ratio
918
- best_medisoft_id = medisoft_id
919
- # Log the current best match
920
- MediLink_ConfigLoader.log("New best match found: Medisoft ID '{}' with match ratio {:.2f}".format(best_medisoft_id, best_match_ratio), level="DEBUG")
921
-
922
- # Log the final best match ratio and ID
923
- MediLink_ConfigLoader.log("Final best match ratio: {:.2f} for Medisoft ID '{}'".format(best_match_ratio, best_medisoft_id), level="DEBUG")
924
-
925
- # No threshold applied, return the best match found
926
- return best_medisoft_id
927
-
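Underneath, the match score is difflib's SequenceMatcher ratio over the uppercased, digit-stripped names; roughly (names hypothetical):

import difflib

def score(a, b):
    a = ''.join(c for c in a if not c.isdigit()).upper()
    b = ''.join(c for c in b if not c.isdigit()).upper()
    return difflib.SequenceMatcher(None, a, b).ratio()

print(score('UNITED HEALTH CARE', 'UNITED HEALTHCARE'))  # ~0.97
print(score('UNITED HEALTH CARE', 'UHC MEDICARE ADV'))   # much lower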
928
- def NEW_update_insurance_ids(csv_data, config, crosswalk):
929
- """
930
- Updates the 'Ins1 Insurance ID' field in each row of csv_data based on the crosswalk and MAINS data.
931
-
932
- Parameters:
933
- - csv_data (list of dict): The CSV data where each row is represented as a dictionary.
934
- - config (dict): Configuration object containing necessary paths and parameters.
935
- - crosswalk (dict): Crosswalk data containing mappings between Payer IDs and Medisoft IDs.
936
-
937
- Returns:
938
- - None: The function modifies the csv_data in place.
939
- """
940
- processed_payer_ids = set() # Track processed Payer IDs
941
- MediLink_ConfigLoader.log("Starting update of insurance IDs.", level="INFO")
942
-
943
- # PERFORMANCE FIX: Pre-build flattened payer lookup cache to avoid nested dictionary access
944
- payer_cache = {}
945
- crosswalk_payers = crosswalk.get('payer_id', {})
946
- for payer_id, details in crosswalk_payers.items():
947
- payer_cache[payer_id] = {
948
- 'medisoft_id': details.get('medisoft_id', []),
949
- 'medisoft_medicare_id': details.get('medisoft_medicare_id', []),
950
- 'endpoint': details.get('endpoint', None)
951
- }
952
- MediLink_ConfigLoader.log("Built payer cache for {} payers".format(len(payer_cache)), level="DEBUG")
953
-
954
- # Load MAINS data to get mapping from Medisoft ID to MAINS names
955
- insurance_to_id = load_insurance_data_from_mains(config) # Assuming it returns a dict mapping insurance names to IDs
956
- MediLink_ConfigLoader.log("Loaded MAINS data for insurance to ID mapping.", level="DEBUG")
957
-
958
- # Invert the mapping to get Medisoft ID to MAINS names
959
- medisoft_to_mains_names = defaultdict(list)
960
- for insurance_name, medisoft_id in insurance_to_id.items():
961
- medisoft_to_mains_names[medisoft_id].append(insurance_name)
962
-
963
- for row_idx, row in enumerate(csv_data, 1):
964
- # PERFORMANCE FIX: Store row index to avoid O(n) csv_data.index() calls later
965
- row['_row_index'] = row_idx
966
- ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
967
- MediLink_ConfigLoader.log("Processing row with Ins1 Payer ID: '{}'.".format(ins1_payer_id), level="DEBUG")
968
-
969
- if ins1_payer_id:
970
- # Mark this Payer ID as processed
971
- if ins1_payer_id not in processed_payer_ids:
972
- processed_payer_ids.add(ins1_payer_id) # Add to set
973
- MediLink_ConfigLoader.log("Marked Payer ID '{}' as processed.".format(ins1_payer_id), level="DEBUG")
974
-
975
- # PERFORMANCE FIX: Use flattened cache instead of nested dictionary lookups
976
- payer_info = payer_cache.get(ins1_payer_id, {})
977
- medisoft_ids = payer_info.get('medisoft_id', [])
978
- MediLink_ConfigLoader.log("Retrieved Medisoft IDs for Payer ID '{}': {}".format(ins1_payer_id, medisoft_ids), level="DEBUG")
979
-
980
- if not medisoft_ids:
981
- MediLink_ConfigLoader.log("No Medisoft IDs available for Payer ID '{}', creating placeholder entry.".format(ins1_payer_id), level="WARNING")
982
- # Create a placeholder entry in the crosswalk and cache
983
- placeholder_entry = {
984
- 'medisoft_id': [], # Placeholder for future Medisoft IDs
985
- 'medisoft_medicare_id': [], # Placeholder for future Medicare IDs
986
- 'endpoint': None # Placeholder for future endpoint
987
- }
988
- if 'payer_id' not in crosswalk:
989
- crosswalk['payer_id'] = {}
990
- crosswalk['payer_id'][ins1_payer_id] = placeholder_entry
991
- # PERFORMANCE FIX: Update cache with placeholder entry
992
- payer_cache[ins1_payer_id] = placeholder_entry
993
- continue # Skip further processing for this Payer ID
994
-
995
- # If only one Medisoft ID is associated, assign it directly
996
- if len(medisoft_ids) == 1:
997
- try:
998
- medisoft_id = int(medisoft_ids[0])
999
- row['Ins1 Insurance ID'] = medisoft_id
1000
- # PERFORMANCE FIX: Use enumerate index instead of csv_data.index() which is O(n)
1001
- row_number = row.get('_row_index', 'Unknown') # row is a dict, so the index must be read with .get(), not getattr()
1002
- MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row number {} with Payer ID '{}'.".format(medisoft_id, row_number, ins1_payer_id), level="DEBUG")
1003
- except ValueError as e:
1004
- MediLink_ConfigLoader.log("Error converting Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
1005
- row['Ins1 Insurance ID'] = None
1006
- continue # Move to the next row
1007
-
1008
- # If multiple Medisoft IDs are associated, perform fuzzy matching
1009
- insurance_name = row.get('Primary Insurance', '').strip()
1010
- if not insurance_name:
1011
- MediLink_ConfigLoader.log("Row with Payer ID '{}' missing 'Primary Insurance', skipping assignment.".format(ins1_payer_id), level="WARNING")
1012
- continue # Skip if insurance name is missing
1013
-
1014
- best_medisoft_id = find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names)
1015
-
1016
- if best_medisoft_id:
1017
- row['Ins1 Insurance ID'] = best_medisoft_id
1018
- MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row with Payer ID '{}' based on fuzzy match.".format(best_medisoft_id, ins1_payer_id), level="INFO")
1019
- else:
1020
- # Default to the first Medisoft ID if no good match is found
1021
- try:
1022
- default_medisoft_id = int(medisoft_ids[0])
1023
- row['Ins1 Insurance ID'] = default_medisoft_id
1024
- MediLink_ConfigLoader.log("No suitable match found. Defaulted to Medisoft ID '{}' for Payer ID '{}'.".format(default_medisoft_id, ins1_payer_id), level="INFO")
1025
- except ValueError as e:
1026
- MediLink_ConfigLoader.log("Error converting default Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
1027
- row['Ins1 Insurance ID'] = None
1028
-
1029
- def update_insurance_ids(csv_data, config, crosswalk):
1030
- # LOGGING STRATEGY: Remove DEBUG level function start log - DEBUG is typically silent anyway
1031
- # MediLink_ConfigLoader.log("Starting update_insurance_ids function.", level="DEBUG")
1032
-
1033
- # TIMING: Start insurance ID updates with granular tracking
1034
- total_start_time = time.time()
1035
- lookup_build_time = 0
1036
- csv_processing_time = 0
1037
- processed_count = 0
1038
- medicare_count = 0
1039
- regular_count = 0
1040
- placeholder_count = 0
1041
-
1042
- print("Starting insurance ID updates for {} rows...".format(len(csv_data)))
1043
- # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes
1044
- # MediLink_ConfigLoader.log("Starting insurance ID updates for {} rows...".format(len(csv_data)), level="INFO") # REMOVED
1045
-
1046
- # TIMING: Start lookup dictionary building
1047
- lookup_start_time = time.time()
1048
-
1049
- # PERFORMANCE FIX: Pre-build optimized lookup dictionaries for both regular and Medicare IDs
1050
- # This reduces Medicare processing overhead by building lookups once instead of repeated processing
1051
- payer_id_to_medisoft = {}
1052
- payer_id_to_medicare = {}
1053
- # LOGGING STRATEGY: Remove DEBUG level initialization log - DEBUG is typically silent anyway
1054
- # MediLink_ConfigLoader.log("Initialized optimized lookup dictionaries for Medicare and regular IDs.", level="DEBUG")
1055
-
1056
- # Build both lookup dictionaries simultaneously to avoid multiple iterations
1057
- for payer_id, details in crosswalk.get('payer_id', {}).items():
1058
- # Get both regular and Medicare IDs
1059
- medisoft_ids = details.get('medisoft_id', [])
1060
- medicare_ids = details.get('medisoft_medicare_id', [])
1061
-
1062
- # Filter empty strings once for each type
1063
- medisoft_ids = [id for id in medisoft_ids if id] if medisoft_ids else []
1064
- medicare_ids = [id for id in medicare_ids if id] if medicare_ids else []
1065
-
1066
- # Store first valid ID for quick lookup (Medicare takes precedence if available)
1067
- payer_id_to_medisoft[payer_id] = int(medisoft_ids[0]) if medisoft_ids else None
1068
- payer_id_to_medicare[payer_id] = int(medicare_ids[0]) if medicare_ids else None
1069
-
1070
- # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1071
- # if len(payer_id_to_medisoft) <= 10 or len(payer_id_to_medisoft) % 50 == 0: # Log first 10 and every 50th
1072
- # MediLink_ConfigLoader.log("Processed Payer ID '{}': Regular IDs: {}, Medicare IDs: {}".format(
1073
- # payer_id, medisoft_ids, medicare_ids), level="DEBUG")
1074
-
1075
- # TIMING: End lookup dictionary building
1076
- lookup_end_time = time.time()
1077
- lookup_build_time = lookup_end_time - lookup_start_time
1078
-
1079
- if PERFORMANCE_LOGGING:
1080
- print("Built lookup dictionaries in {:.2f} seconds for {} payer IDs".format(lookup_build_time, len(payer_id_to_medisoft)))
1081
-
1082
-
1083
- # TIMING: Start CSV processing
1084
- csv_start_time = time.time()
1085
-
1086
- # PERFORMANCE FIX: Single pass through CSV data with optimized Medicare ID resolution
1087
- for row_idx, row in enumerate(csv_data, 1):
1088
- ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
1089
- # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1090
- # if row_idx <= 10 or row_idx % 100 == 0: # Log first 10 and every 100th
1091
- # MediLink_ConfigLoader.log("Processing row #{} with Ins1 Payer ID '{}'.".format(row_idx, ins1_payer_id), level="DEBUG")
1092
-
1093
- # Try Medicare ID first, then fall back to regular ID (optimized Medicare processing)
1094
- insurance_id = (payer_id_to_medicare.get(ins1_payer_id) or
1095
- payer_id_to_medisoft.get(ins1_payer_id))
1096
-
1097
- if insurance_id is None and ins1_payer_id not in payer_id_to_medisoft:
1098
- # Add placeholder entry for new payer ID (preserve original functionality)
1099
- payer_id_to_medisoft[ins1_payer_id] = None
1100
- payer_id_to_medicare[ins1_payer_id] = None
1101
- crosswalk.setdefault('payer_id', {})[ins1_payer_id] = {
1102
- 'medisoft_id': [], # Placeholder for future Medisoft IDs
1103
- 'medisoft_medicare_id': [], # Placeholder for future Medicare IDs
1104
- 'endpoint': None # Placeholder for future endpoint
1105
- }
1106
- placeholder_count += 1
1107
- # LOGGING STRATEGY: Log actual events (new payer IDs) at INFO level
1108
- if placeholder_count <= 5: # Only log first 5 placeholders
1109
- MediLink_ConfigLoader.log("Added placeholder entry for new Payer ID '{}'.".format(ins1_payer_id), level="INFO")
1110
- elif insurance_id is not None and insurance_id == payer_id_to_medicare.get(ins1_payer_id):
1111
- medicare_count += 1
1112
- else:
1113
- regular_count += 1
1114
-
1115
- # Assign the resolved insurance ID to the row
1116
- row['Ins1 Insurance ID'] = insurance_id
1117
- # TODO (SECONDARY QUEUE): When building a secondary-claims queue after Medicare crossover,
1118
- # set claim_type='secondary' and attach prior payer fields here from the Medicare primary outcome:
1119
- # - row['prior_payer_name'] = 'MEDICARE'
1120
- # - row['prior_payer_id'] = best Medicare ID from config/crosswalk
1121
- # - optionally row['primary_paid_amount'], row['cas_adjustments'] extracted from 835
1122
- processed_count += 1
1123
- # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1124
- # if processed_count <= 10 or processed_count % 100 == 0: # Log first 10 and every 100th
1125
- # MediLink_ConfigLoader.log("Assigned Insurance ID '{}' to row with Ins1 Payer ID '{}'.".format(insurance_id, ins1_payer_id), level="DEBUG")
1126
-
1127
- # TIMING: End CSV processing
1128
- csv_end_time = time.time()
1129
- csv_processing_time = csv_end_time - csv_start_time
1130
-
1131
- # TIMING: End total insurance ID updates
1132
- total_end_time = time.time()
1133
- total_duration = total_end_time - total_start_time
1134
-
1135
- if PERFORMANCE_LOGGING:
1136
- print("Insurance ID updates completed:")
1137
- print(" - Total duration: {:.2f} seconds".format(total_duration))
1138
- print(" - Lookup building time: {:.2f} seconds ({:.1f}%)".format(lookup_build_time, (lookup_build_time/total_duration)*100))
1139
- print(" - CSV processing time: {:.2f} seconds ({:.1f}%)".format(csv_processing_time, (csv_processing_time/total_duration)*100))
1140
- print(" - Processed: {} rows, Medicare: {} rows, Regular: {} rows, Placeholders: {} rows".format(
1141
- processed_count, medicare_count, regular_count, placeholder_count))
1142
-
1143
- # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
1144
- MediLink_ConfigLoader.log("Insurance ID updates completed - Total: {:.2f}s, Lookup: {:.2f}s, Processing: {:.2f}s, Processed: {}, Medicare: {}, Regular: {}, Placeholders: {}".format(
1145
- total_duration, lookup_build_time, csv_processing_time, processed_count, medicare_count, regular_count, placeholder_count), level="INFO")
1146
-
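The per-row resolution reduces to a two-dictionary lookup with Medicare taking precedence; sketched on toy data (payer and Medisoft IDs hypothetical):

payer_id_to_medisoft = {'87726': 5, '00590': 7}
payer_id_to_medicare = {'87726': 12, '00590': None}
for pid in ('87726', '00590'):
    resolved = payer_id_to_medicare.get(pid) or payer_id_to_medisoft.get(pid)
    print(pid, '->', resolved)
# 87726 -> 12 (Medicare ID wins)
# 00590 -> 7  (falls back to the regular Medisoft ID)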
1147
- def update_procedure_codes(csv_data, crosswalk):
1148
-
1149
- # Get Medisoft shorthand dictionary from crosswalk and reverse it
1150
- diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {}) # BUG We need to be careful here in case we decide we need to change the crosswalk data specifically with regard to the T8/H usage.
1151
- medisoft_to_diagnosis = {v: k for k, v in diagnosis_to_medisoft.items()}
1152
-
1153
- # Get procedure code to diagnosis dictionary from crosswalk and reverse it for easier lookup
1154
- diagnosis_to_procedure = {
1155
- diagnosis_code: procedure_code
1156
- for procedure_code, diagnosis_codes in crosswalk.get('procedure_to_diagnosis', {}).items()
1157
- for diagnosis_code in diagnosis_codes
1158
- }
1159
-
1160
- # Initialize counters for tracking
1161
- updated_count = 0
1162
- missing_medisoft_codes = set()
1163
- missing_procedure_mappings = set()
1164
-
1165
- # Update the "Procedure Code" column in the CSV data
1166
- for row_num, row in enumerate(csv_data, start=1):
1167
- try:
1168
- medisoft_code = row.get('Default Diagnosis #1', '').strip()
1169
- diagnosis_code = medisoft_to_diagnosis.get(medisoft_code)
1170
-
1171
- if diagnosis_code:
1172
- procedure_code = diagnosis_to_procedure.get(diagnosis_code)
1173
- if procedure_code:
1174
- row['Procedure Code'] = procedure_code
1175
- updated_count += 1
1176
- else:
1177
- # Track missing procedure mapping
1178
- missing_procedure_mappings.add(diagnosis_code)
1179
- row['Procedure Code'] = "Unknown" # Will be handled by 837p encoder
1180
- MediLink_ConfigLoader.log("Missing procedure mapping for diagnosis code '{}' (Medisoft code: '{}') in row {}".format(
1181
- diagnosis_code, medisoft_code, row_num), level="WARNING")
1182
- else:
1183
- # Track missing Medisoft code mapping
1184
- if medisoft_code: # Only track if there's actually a code
1185
- missing_medisoft_codes.add(medisoft_code)
1186
- row['Procedure Code'] = "Unknown" # Will be handled by 837p encoder
1187
- MediLink_ConfigLoader.log("Missing Medisoft code mapping for '{}' in row {}".format(
1188
- medisoft_code, row_num), level="WARNING")
1189
- except Exception as e:
1190
- MediLink_ConfigLoader.log("In update_procedure_codes, Error processing row {}: {}".format(row_num, e), level="ERROR")
1191
-
1192
- # Log summary statistics
1193
- MediLink_ConfigLoader.log("Total {} 'Procedure Code' rows updated.".format(updated_count), level="INFO")
1194
-
1195
- if missing_medisoft_codes:
1196
- MediLink_ConfigLoader.log("Missing Medisoft code mappings: {}".format(sorted(missing_medisoft_codes)), level="WARNING")
1197
- print("WARNING: {} Medisoft codes need to be added to diagnosis_to_medisoft mapping: {}".format(
1198
- len(missing_medisoft_codes), sorted(missing_medisoft_codes)))
1199
-
1200
- if missing_procedure_mappings:
1201
- MediLink_ConfigLoader.log("Missing procedure mappings for diagnosis codes: {}".format(sorted(missing_procedure_mappings)), level="WARNING")
1202
- print("WARNING: {} diagnosis codes need to be added to procedure_to_diagnosis mapping: {}".format(
1203
- len(missing_procedure_mappings), sorted(missing_procedure_mappings)))
1204
-
1205
- return True
1206
-
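The function walks Medisoft shorthand -> diagnosis -> procedure through two inverted crosswalk maps; the chain in isolation (codes illustrative, not taken from the package's crosswalk):

diagnosis_to_medisoft = {'H25.11': 'CATR', 'H25.12': 'CATL'}
medisoft_to_diagnosis = {v: k for k, v in diagnosis_to_medisoft.items()}
procedure_to_diagnosis = {'66984': ['H25.11', 'H25.12']}
diagnosis_to_procedure = {dx: cpt
                          for cpt, dxs in procedure_to_diagnosis.items()
                          for dx in dxs}
print(diagnosis_to_procedure[medisoft_to_diagnosis['CATR']])  # 66984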
1207
- def update_diagnosis_codes(csv_data):
1208
- try:
1209
- # TIMING: Start surgery schedule parsing timing
1210
- parsing_start_time = time.time()
1211
- print("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")))
1212
- MediLink_ConfigLoader.log("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")), level="INFO")
1213
-
1214
- # Use cached configuration instead of loading repeatedly
1215
- config, crosswalk = get_cached_configuration()
1216
-
1217
- # Extract the local storage path from the configuration
1218
- local_storage_path = config['MediLink_Config']['local_storage_path']
1219
-
1220
- # Initialize a dictionary to hold diagnosis codes from all DOCX files
1221
- all_patient_data = {}
1222
- all_schedule_positions = {} # NEW: Store schedule positions for future sorting
1223
-
1224
- # Convert surgery dates in CSV data
1225
- convert_surgery_date(csv_data)
1226
-
1227
- # Extract all valid surgery dates from csv_data
1228
- surgery_dates = [row['Surgery Date'] for row in csv_data if row['Surgery Date'] != datetime.min]
1229
-
1230
- if not surgery_dates:
1231
- raise ValueError("No valid surgery dates found in csv_data.")
1232
-
1233
- # Determine the minimum and maximum surgery dates
1234
- min_surgery_date = min(surgery_dates)
1235
- max_surgery_date = max(surgery_dates)
1236
-
1237
- # Apply a +/-8-day margin to the surgery dates (increased from 5 days)
1238
- margin = timedelta(days=8)
1239
- threshold_start = min_surgery_date - margin
1240
- threshold_end = max_surgery_date + margin
1241
-
1242
- # TODO (Low) This is a bad idea and needs a better approach: if downloading the DOCX files takes
1243
- # 'too long', they will be presumed out of range, because the modified date is a poor proxy for
1244
- # the surgery date contained inside the DOCX file. Extracting the surgery date from the file
1245
- # itself is non-trivial and computationally expensive, so we need a smarter way to handle this.
1246
-
1247
- MediLink_ConfigLoader.log("BAD IDEA: Processing DOCX files modified between {} and {}.".format(threshold_start, threshold_end), level="INFO")
1248
-
1249
- # TIMING: Start file system operations
1250
- filesystem_start_time = time.time()
1251
-
1252
- # PERFORMANCE OPTIMIZATION: Batch file system operations with caching
1253
- # Pre-convert threshold timestamps for efficient comparison (Windows XP compatible)
1254
- threshold_start_ts = threshold_start.timestamp() if hasattr(threshold_start, 'timestamp') else time.mktime(threshold_start.timetuple())
1255
- threshold_end_ts = threshold_end.timestamp() if hasattr(threshold_end, 'timestamp') else time.mktime(threshold_end.timetuple())
1256
-
1257
- valid_files = []
1258
- try:
1259
- # Use os.listdir() with optimized timestamp comparison (XP/3.4.4 compatible)
1260
- for filename in os.listdir(local_storage_path):
1261
- if filename.endswith('.docx'):
1262
- filepath = os.path.join(local_storage_path, filename)
1263
- # Get file modification time in single operation
1264
- try:
1265
- stat_info = os.stat(filepath)
1266
- # Direct timestamp comparison avoids datetime conversion overhead
1267
- if threshold_start_ts <= stat_info.st_mtime <= threshold_end_ts:
1268
- valid_files.append(filepath)
1269
- except (OSError, ValueError):
1270
- # Skip files with invalid modification times
1271
- continue
1272
- except OSError:
1273
- MediLink_ConfigLoader.log("Error accessing directory: {}".format(local_storage_path), level="ERROR")
1274
- return
1275
-
1276
- # TIMING: End file system operations
1277
- filesystem_end_time = time.time()
1278
- filesystem_duration = filesystem_end_time - filesystem_start_time
1279
-
1280
- # PERFORMANCE OPTIMIZATION: Log file count for debugging without processing overhead
1281
- MediLink_ConfigLoader.log("Found {} DOCX files within date threshold".format(len(valid_files)), level="INFO")
1282
-
1283
- # TIMING: Start CSV data preprocessing
1284
- csv_prep_start_time = time.time()
1285
-
1286
- # PERFORMANCE OPTIMIZATION: Pre-process patient IDs for efficient lookup
1287
- # Create a set of patient IDs from CSV data for faster lookups
1288
- patient_ids_in_csv = {row.get('Patient ID', '').strip() for row in csv_data}
1289
-
1290
- # PERFORMANCE OPTIMIZATION: Pre-convert surgery dates to string format
1291
- # Convert all surgery dates to string format once to avoid repeated conversions in loops
1292
- surgery_date_strings = {}
1293
- for row in csv_data:
1294
- patient_id = row.get('Patient ID', '').strip()
1295
- surgery_date = row.get('Surgery Date')
1296
- if surgery_date != datetime.min:
1297
- surgery_date_strings[patient_id] = surgery_date.strftime("%m-%d-%Y")
1298
- else:
1299
- surgery_date_strings[patient_id] = ''
1300
-
1301
- # TIMING: End CSV data preprocessing
1302
- csv_prep_end_time = time.time()
1303
- csv_prep_duration = csv_prep_end_time - csv_prep_start_time
1304
-
1305
- # TIMING: Log before processing DOCX files
1306
- docx_processing_start_time = time.time()
1307
- print("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)))
1308
- MediLink_ConfigLoader.log("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)), level="INFO")
1309
-
1310
- # TIMING: Track individual DOCX file processing
1311
- docx_files_processed = 0
1312
- docx_files_skipped = 0
1313
- docx_parse_errors = 0
1314
-
1315
- # Process valid DOCX files
1316
- for filepath in valid_files:
1317
- # TIMING: Start individual file processing
1318
- file_start_time = time.time()
1319
-
1320
- try:
1321
- if SORTING_STRATEGY == 'schedule_based':
1322
- # Enhanced parsing to capture schedule positions
1323
- patient_data, schedule_positions = parse_docx(filepath, surgery_dates, capture_schedule_positions=True) # Pass surgery_dates to parse_docx
1324
- # Store schedule positions for future sorting
1325
- for patient_id, dates in schedule_positions.items():
1326
- if patient_id not in all_schedule_positions:
1327
- all_schedule_positions[patient_id] = {}
1328
- all_schedule_positions[patient_id].update(dates)
1329
- else:
1330
- # Standard parsing (maintains backward compatibility)
1331
- patient_data = parse_docx(filepath, surgery_dates, capture_schedule_positions=False) # Pass surgery_dates to parse_docx
1332
-
1333
- docx_files_processed += 1
1334
-
1335
- # PERFORMANCE OPTIMIZATION: Use defaultdict for more efficient dictionary operations
1336
- for patient_id, service_dates in patient_data.items():
1337
- if patient_id not in all_patient_data:
1338
- all_patient_data[patient_id] = {}
1339
- for date_of_service, diagnosis_data in service_dates.items():
1340
- # TODO: SURGERY SCHEDULE CONFLICT RESOLUTION
1341
- # Implement enhanced conflict detection and logging as outlined in
1342
- # surgery_schedule_conflict_resolution_strategy.md
1343
- #
1344
- # Current behavior: Silent overwriting with latest file wins
1345
- # Proposed enhancement:
1346
- # 1. Detect when multiple files contain data for same date
1347
- # 2. Log conflicts with date-organized notifications showing:
1348
- # - Source files (with modification timestamps)
1349
- # - Patients affected (added/removed/modified)
1350
- # - Specific changes (diagnosis, laterality, etc.)
1351
- # 3. Use file modification time to determine priority
1352
- # 4. Generate summary report organized by surgery date
1353
- #
1354
- # Example notification format:
1355
- # "SURGERY SCHEDULE CONFLICTS DETECTED FOR: 12/15/2023"
1356
- # " Original: file1.docx (modified: 08:30:00)"
1357
- # " Revised: file2.docx (modified: 14:45:00)"
1358
- # " Patients affected: 3 modified, 1 added, 1 removed"
1359
- # " Resolution: Using latest file (file2.docx)"
1360
- #
1361
- # This will provide transparency when revised schedules overwrite
1362
- # original schedules, organized by the affected surgery dates.
1363
- all_patient_data[patient_id][date_of_service] = diagnosis_data
1364
- except Exception as e:
1365
- docx_parse_errors += 1
1366
- MediLink_ConfigLoader.log("Error parsing DOCX file {}: {}".format(filepath, e), level="ERROR")
1367
-
1368
- # TIMING: End individual file processing
1369
- file_end_time = time.time()
1370
- file_duration = file_end_time - file_start_time
1371
-
1372
- # Log slow files (taking more than 1 second)
1373
- if file_duration > 1.0 and PERFORMANCE_LOGGING:
1374
- print(" - Slow file: {} (Duration: {:.2f} seconds)".format(os.path.basename(filepath), file_duration))
1375
-
1376
- # TIMING: Log DOCX processing completion
1377
- docx_processing_end_time = time.time()
1378
- docx_processing_duration = docx_processing_end_time - docx_processing_start_time
1379
- if PERFORMANCE_LOGGING:
1380
- print("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1381
- time.strftime("%H:%M:%S"), docx_processing_duration))
1382
- print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(
1383
- docx_files_processed, docx_files_skipped, docx_parse_errors))
1384
- MediLink_ConfigLoader.log("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1385
- time.strftime("%H:%M:%S"), docx_processing_duration), level="INFO")
1386
-
1387
- # Log if no valid files were found
1388
- if not valid_files:
1389
- MediLink_ConfigLoader.log("No valid DOCX files found within the modification time threshold.", level="INFO")
1390
-
1391
- # Debug logging for all_patient_data
1392
- MediLink_ConfigLoader.log("All patient data collected from DOCX files: {}".format(all_patient_data), level="DEBUG")
1393
-
1394
- # Check if any patient data was collected
1395
- if not all_patient_data or not patient_ids_in_csv.intersection(all_patient_data.keys()):
1396
- MediLink_ConfigLoader.log("No patient data collected or no matching Patient IDs found. Skipping further processing.", level="INFO")
1397
- return # Exit the function early if no data is available
1398
-
1399
- # TIMING: Start CSV data matching
1400
- csv_matching_start_time = time.time()
1401
-
1402
- # Get Medisoft shorthand dictionary from crosswalk.
1403
- diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {})
1404
-
1405
- # Initialize counter for updated rows
1406
- updated_count = 0
1407
-
1408
- # PERFORMANCE OPTIMIZATION: Single pass through CSV data with pre-processed lookups
1409
- # Update the "Default Diagnosis #1" column in the CSV data and store diagnosis codes for all surgery dates
1410
- for row_num, row in enumerate(csv_data, start=1):
1411
- patient_id = row.get('Patient ID', '').strip()
1412
- # Use pre-processed patient ID lookup for efficiency
1413
- if patient_id not in patient_ids_in_csv:
1414
- continue # Skip rows that do not match any patient ID
1415
-
1416
- MediLink_ConfigLoader.log("Processing row number {}.".format(row_num), level="DEBUG")
1417
-
1418
- # Get all surgery dates for this patient
1419
- all_surgery_dates = row.get('_all_surgery_dates', [row.get('Surgery Date')])
1420
-
1421
- # Create a mapping of surgery dates to diagnosis codes for this patient
1422
- surgery_date_to_diagnosis = {}
1423
-
1424
- if patient_id in all_patient_data:
1425
- # Process each surgery date for this patient
1426
- for surgery_date in all_surgery_dates:
1427
- # Convert surgery date to string format for lookup
1428
- try:
1429
- if hasattr(surgery_date, 'strftime'):
1430
- surgery_date_str = surgery_date.strftime('%m-%d-%Y')
1431
- else:
1432
- surgery_date_str = str(surgery_date)
1433
- except Exception:
1434
- surgery_date_str = str(surgery_date)
1435
-
1436
- MediLink_ConfigLoader.log("Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1437
-
1438
- if surgery_date_str in all_patient_data[patient_id]:
1439
- diagnosis_data = all_patient_data[patient_id][surgery_date_str]
1440
- # XP SP3 + Py3.4.4 compatible tuple unpacking with safety check
1441
- try:
1442
- if isinstance(diagnosis_data, (list, tuple)) and len(diagnosis_data) >= 3:
1443
- diagnosis_code, left_or_right_eye, femto_yes_or_no = diagnosis_data
1444
- else:
1445
- # Handle case where diagnosis_data is not a proper tuple
1446
- diagnosis_code = diagnosis_data if diagnosis_data else None
1447
- left_or_right_eye = None
1448
- femto_yes_or_no = None
1449
- except Exception as e:
1450
- MediLink_ConfigLoader.log("Error unpacking diagnosis data for Patient ID: {}, Surgery Date: {}: {}".format(
1451
- patient_id, surgery_date_str, str(e)), level="WARNING")
1452
- diagnosis_code = None
1453
- left_or_right_eye = None
1454
- femto_yes_or_no = None
1455
-
1456
- MediLink_ConfigLoader.log("Found diagnosis data for Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1457
-
1458
- # Convert diagnosis code to Medisoft shorthand format.
1459
- # XP SP3 + Py3.4.4 compatible null check
1460
- if diagnosis_code is None:
1461
- medisoft_shorthand = 'N/A'
1462
- MediLink_ConfigLoader.log("Diagnosis code is None for Patient ID: {}, Surgery Date: {}".format(
1463
- patient_id, surgery_date_str), level="WARNING")
1464
- else:
1465
- medisoft_shorthand = diagnosis_to_medisoft.get(diagnosis_code, None)
1466
- if medisoft_shorthand is None and diagnosis_code:
1467
- # Use fallback logic for missing mapping (XP SP3 + Py3.4.4 compatible)
1468
- try:
1469
- defaulted_code = diagnosis_code.lstrip('H').lstrip('T8').replace('.', '')[-5:]
1470
- # Basic validation: ensure code is not empty and has reasonable length
1471
- if defaulted_code and len(defaulted_code) >= 3:
1472
- medisoft_shorthand = defaulted_code
1473
- MediLink_ConfigLoader.log("Missing diagnosis mapping for '{}', using fallback code '{}'".format(
1474
- diagnosis_code, medisoft_shorthand), level="WARNING")
1475
- else:
1476
- medisoft_shorthand = 'N/A'
1477
- MediLink_ConfigLoader.log("Fallback diagnosis code validation failed for '{}', using 'N/A'".format(
1478
- diagnosis_code), level="WARNING")
1479
- except Exception as e:
1480
- medisoft_shorthand = 'N/A'
1481
- MediLink_ConfigLoader.log("Error in fallback diagnosis code generation for '{}': {}".format(
1482
- diagnosis_code, str(e)), level="WARNING")
1483
-
1484
- MediLink_ConfigLoader.log("Converted diagnosis code to Medisoft shorthand: {}".format(medisoft_shorthand), level="DEBUG")
1485
-
1486
- surgery_date_to_diagnosis[surgery_date_str] = medisoft_shorthand
1487
- else:
1488
- MediLink_ConfigLoader.log("No matching surgery date found for Patient ID: {} on date {}.".format(patient_id, surgery_date_str), level="INFO")
1489
- surgery_date_to_diagnosis[surgery_date_str] = 'N/A'
1490
-
1491
- # Store the diagnosis mapping for all surgery dates
1492
- row['_surgery_date_to_diagnosis'] = surgery_date_to_diagnosis
1493
-
1494
- # NEW: Store schedule positions for future sorting if available
1495
- if SORTING_STRATEGY == 'schedule_based' and patient_id in all_schedule_positions:
1496
- row['_schedule_positions'] = all_schedule_positions[patient_id]
1497
-
1498
- # Set the primary diagnosis code (for the main surgery date)
1499
- primary_surgery_date = row.get('Surgery Date')
1500
- # Convert primary surgery date to string for lookup
1501
- if isinstance(primary_surgery_date, datetime):
1502
- primary_surgery_date_str = primary_surgery_date.strftime('%m-%d-%Y')
1503
- else:
1504
- primary_surgery_date_str = str(primary_surgery_date)
1505
- primary_diagnosis = surgery_date_to_diagnosis.get(primary_surgery_date_str, 'N/A')
1506
- row['Default Diagnosis #1'] = primary_diagnosis
1507
-
1508
- updated_count += 1
1509
- MediLink_ConfigLoader.log("Updated row number {} with diagnosis codes for {} surgery dates.".format(row_num, len(all_surgery_dates)), level="INFO")
1510
- else:
1511
- MediLink_ConfigLoader.log("Patient ID: {} not found in DOCX data for row {}.".format(patient_id, row_num), level="INFO")
1512
-
1513
- # TIMING: End CSV data matching
1514
- csv_matching_end_time = time.time()
1515
- csv_matching_duration = csv_matching_end_time - csv_matching_start_time
1516
-
1517
- # Log total count of updated rows
1518
- MediLink_ConfigLoader.log("Total {} 'Default Diagnosis #1' rows updated.".format(updated_count), level="INFO")
1519
-
1520
- # TIMING: End surgery schedule parsing timing
1521
- parsing_end_time = time.time()
1522
- parsing_duration = parsing_end_time - parsing_start_time
1523
- if PERFORMANCE_LOGGING:
1524
- print("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1525
- time.strftime("%H:%M:%S"), parsing_duration))
1526
- print(" - File system operations: {:.2f} seconds ({:.1f}%)".format(filesystem_duration, (filesystem_duration/parsing_duration)*100))
1527
- print(" - CSV data preprocessing: {:.2f} seconds ({:.1f}%)".format(csv_prep_duration, (csv_prep_duration/parsing_duration)*100))
1528
- print(" - DOCX file processing: {:.2f} seconds ({:.1f}%)".format(docx_processing_duration, (docx_processing_duration/parsing_duration)*100))
1529
- print(" - CSV data matching: {:.2f} seconds ({:.1f}%)".format(csv_matching_duration, (csv_matching_duration/parsing_duration)*100))
1530
- print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(docx_files_processed, docx_files_skipped, docx_parse_errors))
1531
- MediLink_ConfigLoader.log("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1532
- time.strftime("%H:%M:%S"), parsing_duration), level="INFO")
1533
-
1534
- except Exception as e:
1535
- message = "An error occurred while updating diagnosis codes. Please check the DOCX files and configuration: {}".format(e)
1536
- MediLink_ConfigLoader.log(message, level="ERROR")
1537
- print(message)
1538
-
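The conflict-resolution TODO above currently ends in silent last-write-wins. A minimal sketch of the detection step it proposes, assuming a hypothetical per-file data shape of {filepath: {patient_id: {date_str: diagnosis_data}}}; the helper name and shape are illustrative, not part of this package:

```python
import os

def detect_schedule_conflicts(per_file_data):
    """Return {date_str: [(filepath, mtime), ...]} for dates fed by more than one DOCX file.

    per_file_data: {filepath: {patient_id: {date_str: diagnosis_data}}} (assumed shape).
    """
    date_sources = {}
    for filepath, patients in per_file_data.items():
        mtime = os.path.getmtime(filepath)  # modification time decides priority
        for service_dates in patients.values():
            for date_str in service_dates:
                date_sources.setdefault(date_str, set()).add((filepath, mtime))
    # A date is conflicted when multiple files contributed to it; sorting by
    # mtime lets the caller log "latest file wins" instead of overwriting silently.
    return {date_str: sorted(sources, key=lambda s: s[1])
            for date_str, sources in date_sources.items()
            if len(sources) > 1}
```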
1539
- def load_data_sources(config, crosswalk):
1540
- """Loads historical mappings from MAPAT and Carol's CSVs."""
1541
- patient_id_to_insurance_id = load_insurance_data_from_mapat(config, crosswalk)
1542
- if not patient_id_to_insurance_id:
1543
- raise ValueError("Failed to load historical Patient ID to Insurance ID mappings from MAPAT.")
1544
-
1545
- payer_id_to_patient_ids = load_historical_payer_to_patient_mappings(config)
1546
- if not payer_id_to_patient_ids:
1547
- raise ValueError("Failed to load historical Carol's CSVs.")
1548
-
1549
- return patient_id_to_insurance_id, payer_id_to_patient_ids
1550
-
1551
- def map_payer_ids_to_insurance_ids(patient_id_to_insurance_id, payer_id_to_patient_ids):
1552
- """Maps Payer IDs to Insurance IDs based on the historical mappings."""
1553
- payer_id_to_details = {}
1554
- for payer_id, patient_ids in payer_id_to_patient_ids.items():
1555
- medisoft_ids = set()
1556
- for patient_id in patient_ids:
1557
- if patient_id in patient_id_to_insurance_id:
1558
- medisoft_id = patient_id_to_insurance_id[patient_id]
1559
- medisoft_ids.add(medisoft_id)
1560
- MediLink_ConfigLoader.log("Added Medisoft ID {} for Patient ID {} and Payer ID {}".format(medisoft_id, patient_id, payer_id))
1561
- else:
1562
- MediLink_ConfigLoader.log("No matching Insurance ID found for Patient ID {}".format(patient_id))
1563
- if medisoft_ids:
1564
- payer_id_to_details[payer_id] = {
1565
- "endpoint": "OPTUMEDI", # TODO Default, to be refined via API poll. There are 2 of these defaults!
1566
- "medisoft_id": list(medisoft_ids),
1567
- "medisoft_medicare_id": [] # Placeholder for future implementation
1568
- }
1569
- return payer_id_to_details
1570
-
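For reference, the shape the mapping above produces; the IDs here are hypothetical:

```python
patient_id_to_insurance_id = {'P001': 'M12', 'P002': 'M12', 'P003': 'M45'}
payer_id_to_patient_ids = {'87726': {'P001', 'P002'}, '11315': {'P003', 'P999'}}

details = map_payer_ids_to_insurance_ids(patient_id_to_insurance_id, payer_id_to_patient_ids)
# {'87726': {'endpoint': 'OPTUMEDI', 'medisoft_id': ['M12'], 'medisoft_medicare_id': []},
#  '11315': {'endpoint': 'OPTUMEDI', 'medisoft_id': ['M45'], 'medisoft_medicare_id': []}}
# 'P999' has no MAPAT mapping, so it is logged and skipped rather than mapped.
```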
1571
- def _display_mains_file_error(mains_path):
1572
- """
1573
- Helper function to display the critical MAINS file error message.
1574
-
1575
- Args:
1576
- mains_path (str): The path where the MAINS file was expected to be found.
1577
- """
1578
- error_msg = "CRITICAL: MAINS file not found at: {}. This file is required for insurance name to Medisoft ID mapping.".format(mains_path)
1579
- if hasattr(MediLink_ConfigLoader, 'log'):
1580
- MediLink_ConfigLoader.log(error_msg, level="CRITICAL")
1581
- print("\n" + "="*80)
1582
- print("CRITICAL ERROR: MAINS FILE MISSING")
1583
- print("="*80)
1584
- print("\nThe MAINS file is required for the following critical functions:")
1585
- print("* Mapping insurance company names to Medisoft IDs")
1586
- print("* Converting insurance names to payer IDs for claim submission")
1587
- print("* Creating properly formatted 837p claim files")
1588
- print("\nWithout this file, claim submission will fail because:")
1589
- print("* Insurance names cannot be converted to payer IDs")
1590
- print("* 837p claim files cannot be generated")
1591
- print("* Claims cannot be submitted to insurance companies")
1592
- print("\nTO FIX THIS:")
1593
- print("1. Ensure the MAINS file exists at: {}".format(mains_path))
1594
- print("2. If the file is missing, llamar a Dani")
1595
- print("3. The file should contain insurance company data from your Medisoft system")
1596
- print("="*80)
1597
- time.sleep(3) # 3 second pause to allow user to read critical error message
1598
-
1599
-
1600
- def load_insurance_data_from_mains(config):
1601
- """
1602
- Loads insurance data from MAINS and creates a mapping from insurance names to their respective IDs.
1603
- This mapping is critical for the crosswalk update process to correctly associate payer IDs with insurance IDs.
1604
-
1605
- Args:
1606
- config (dict): Configuration object containing necessary paths and parameters.
1607
-
1608
- Returns:
1609
- dict: A dictionary mapping insurance names to insurance IDs.
1610
- """
1611
- # Use cached configuration to avoid repeated loading
1612
- try:
1613
- config, crosswalk = get_cached_configuration()
1614
- except Exception as e:
1615
- print("Warning: Failed to load cached configuration: {}".format(e))
1616
- # Return empty mapping if configuration loading fails
1617
- return {}
1618
-
1619
- # XP Compatibility: Check if MediLink_DataMgmt is available
1620
- if MediLink_DataMgmt is None:
1621
- print("Warning: MediLink_DataMgmt not available. Cannot load MAINS data.")
1622
- return {}
1623
-
1624
- # Retrieve MAINS path and slicing information from the configuration
1625
- # TODO (Low) For secondary insurance, this needs to be pulling from the correct MAINS (there are 2)
1626
- # TODO (Low) Performance: There probably needs to be a dictionary proxy for MAINS that gets updated.
1627
- # Meh, this just has to be part of the new architecture plan where we make Medisoft a downstream
1628
- # recipient from the db.
1629
- # TODO (High) The Medisoft Medicare flag needs to be brought in here.
1630
- try:
1631
- mains_path = config.get('MAINS_MED_PATH', '')
1632
- mains_slices = crosswalk.get('mains_mapping', {}).get('slices', {})
1633
- except (KeyError, AttributeError) as e:
1634
- print("Warning: Failed to get MAINS configuration: {}".format(e))
1635
- return {}
1636
-
1637
- # Initialize the dictionary to hold the insurance to insurance ID mappings
1638
- insurance_to_id = {}
1639
-
1640
- try:
1641
- # Check if MAINS file exists before attempting to read
1642
- if not os.path.exists(mains_path):
1643
- _display_mains_file_error(mains_path)
1644
- return insurance_to_id
1645
-
1646
- # XP Compatibility: Check if MediLink_DataMgmt has the required function
1647
- if not hasattr(MediLink_DataMgmt, 'read_general_fixed_width_data'):
1648
- print("Warning: MediLink_DataMgmt.read_general_fixed_width_data not available. Cannot load MAINS data.")
1649
- return insurance_to_id
1650
-
1651
- # Read data from MAINS using a provided function to handle fixed-width data
1652
- for record, line_number in MediLink_DataMgmt.read_general_fixed_width_data(mains_path, mains_slices):
1653
- insurance_name = record['MAINSNAME']
1654
- # Assuming line_number gives the correct insurance ID without needing adjustment
1655
- insurance_to_id[insurance_name] = line_number
1656
-
1657
- if hasattr(MediLink_ConfigLoader, 'log'):
1658
- MediLink_ConfigLoader.log("Successfully loaded {} insurance records from MAINS".format(len(insurance_to_id)), level="INFO")
1659
- else:
1660
- print("Successfully loaded {} insurance records from MAINS".format(len(insurance_to_id)))
1661
-
1662
- except FileNotFoundError:
1663
- _display_mains_file_error(mains_path)
1664
- except Exception as e:
1665
- error_msg = "Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e))
1666
- if hasattr(MediLink_ConfigLoader, 'log'):
1667
- MediLink_ConfigLoader.log(error_msg, level="ERROR")
1668
- print("Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e)))
1669
-
1670
- return insurance_to_id
1671
-
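This function and `load_insurance_data_from_mapat` below both rely on `MediLink_DataMgmt.read_general_fixed_width_data(path, slices)` yielding `(record, line_number)` pairs keyed by the slice names from the crosswalk. A compatible reader might look like the sketch below, assuming `slices` maps field names to `[start, end]` character offsets; the actual crosswalk layout is not shown in this diff:

```python
def read_general_fixed_width_data_sketch(path, slices):
    # slices: {'MAINSNAME': [0, 30], ...}  (assumed layout, for illustration only)
    with open(path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            record = {name: line[span[0]:span[1]].strip()
                      for name, span in slices.items()}
            yield record, line_number
```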
1672
- def load_insurance_data_from_mapat(config, crosswalk):
1673
- """
1674
- Loads insurance data from MAPAT and creates a mapping from patient ID to insurance ID.
1675
-
1676
- Args:
1677
- config (dict): Configuration object containing necessary paths and parameters.
1678
- crosswalk (dict): Crosswalk object containing the MAPAT slicing information under 'mapat_mapping'.
1679
-
1680
- Returns:
1681
- dict: A dictionary mapping patient IDs to insurance IDs.
1682
- """
1683
- # Retrieve MAPAT path and slicing information from the configuration
1684
- ac = _ac()
1685
- mapat_path = ac.get_mapat_med_path() if ac else ''
1686
- mapat_slices = crosswalk['mapat_mapping']['slices']
1687
-
1688
- # Initialize the dictionary to hold the patient ID to insurance ID mappings
1689
- patient_id_to_insurance_id = {}
1690
-
1691
- # Read data from MAPAT using a provided function to handle fixed-width data
1692
- for record, _ in MediLink_DataMgmt.read_general_fixed_width_data(mapat_path, mapat_slices):
1693
- patient_id = record['MAPATPXID']
1694
- insurance_id = record['MAPATINID']
1695
- patient_id_to_insurance_id[patient_id] = insurance_id
1696
-
1697
- return patient_id_to_insurance_id
1698
-
1699
- def parse_z_dat(z_dat_path, config): # Why is this in MediBot and not MediLink?
1700
- """
1701
- Parses the Z.dat file to map Patient IDs to Insurance Names using the provided fixed-width file format.
1702
-
1703
- Args:
1704
- z_dat_path (str): Path to the Z.dat file.
1705
- config (dict): Configuration object containing slicing information and other parameters.
1706
-
1707
- Returns:
1708
- dict: A dictionary mapping Patient IDs to Insurance Names.
1709
- """
1710
- patient_id_to_insurance_name = {}
1711
-
1712
- try:
1713
- # Reading blocks of fixed-width data (up to 5 lines per record)
1714
- for personal_info, insurance_info, service_info, service_info_2, service_info_3 in MediLink_DataMgmt.read_fixed_width_data(z_dat_path):
1715
- # Parse Z.dat reserved record format: 3 active + 2 reserved lines
1716
- parsed_data = MediLink_DataMgmt.parse_fixed_width_data(personal_info, insurance_info, service_info, service_info_2, service_info_3, config.get('MediLink_Config', config))
1717
-
1718
- # Extract Patient ID and Insurance Name from parsed data
1719
- patient_id = parsed_data.get('PATID')
1720
- insurance_name = parsed_data.get('INAME')
1721
-
1722
- if patient_id and insurance_name:
1723
- patient_id_to_insurance_name[patient_id] = insurance_name
1724
- MediLink_ConfigLoader.log("Mapped Patient ID {} to Insurance Name {}".format(patient_id, insurance_name), config, level="INFO")
1725
-
1726
- except FileNotFoundError:
1727
- MediLink_ConfigLoader.log("File not found: {}".format(z_dat_path), config, level="INFO")
1728
- except Exception as e:
1729
- MediLink_ConfigLoader.log("Failed to parse Z.dat: {}".format(str(e)), config, level="INFO")
1730
-
1731
- return patient_id_to_insurance_name
1732
-
1733
- def load_historical_payer_to_patient_mappings(config):
1734
- """
1735
- Loads historical mappings from multiple Carol's CSV files in a specified directory,
1736
- mapping Payer IDs to sets of Patient IDs.
1737
-
1738
- Args:
1739
- config (dict): Configuration object containing the directory path for Carol's CSV files
1740
- and other necessary parameters.
1741
-
1742
- Returns:
1743
- dict: A dictionary where each key is a Payer ID and the value is a set of Patient IDs.
1744
- """
1745
- directory_path = os.path.dirname(config['CSV_FILE_PATH'])
1746
- payer_to_patient_ids = defaultdict(set)
1747
-
1748
- try:
1749
- # Check if the directory exists
1750
- if not os.path.isdir(directory_path):
1751
- raise FileNotFoundError("Directory '{}' not found.".format(directory_path))
1752
-
1753
- # Loop through each file in the directory containing Carol's historical CSVs
1754
- for filename in os.listdir(directory_path):
1755
- file_path = os.path.join(directory_path, filename)
1756
- if filename.endswith('.csv'):
1757
- try:
1758
- with open(file_path, 'r', encoding='utf-8') as csvfile:
1759
- reader = csv.DictReader(csvfile)
1760
- patient_count = 0 # Counter for Patient IDs found in this CSV
1761
- for row in reader:
1762
- if 'Patient ID' not in row or 'Ins1 Payer ID' not in row:
1763
- continue # Skip this row if either key is missing
1764
- if not row.get('Patient ID').strip() or not row.get('Ins1 Payer ID').strip():
1765
- continue # Skip this row if either value is missing or empty
1766
-
1767
- payer_id = row['Ins1 Payer ID'].strip()
1768
- patient_id = row['Patient ID'].strip()
1769
- payer_to_patient_ids[payer_id].add(patient_id)
1770
- patient_count += 1 # Increment the counter for each valid mapping
1771
-
1772
- # Log the accumulated count for this CSV file
1773
- if patient_count > 0:
1774
- MediLink_ConfigLoader.log("CSV file '{}' has {} Patient IDs with Payer IDs.".format(filename, patient_count), level="DEBUG")
1775
- else:
1776
- MediLink_ConfigLoader.log("CSV file '{}' is empty or does not have valid Patient ID or Payer ID mappings.".format(filename), level="DEBUG")
1777
- except Exception as e:
1778
- print("Error processing file {}: {}".format(filename, e))
1779
- MediLink_ConfigLoader.log("Error processing file '{}': {}".format(filename, e), level="ERROR")
1780
- except FileNotFoundError as e:
1781
- print("Error: {}".format(e))
1782
-
1783
- if not payer_to_patient_ids:
1784
- print("No historical mappings were generated.")
1785
-
1786
- return dict(payer_to_patient_ids)
1787
-
1788
- def capitalize_all_fields(csv_data):
1789
- """
1790
- Converts all text fields in the CSV data to uppercase.
1791
-
1792
- Parameters:
1793
- csv_data (list of dict): The CSV data where each row is represented as a dictionary.
1794
-
1795
- Returns:
1796
- None: The function modifies the csv_data in place.
1797
- """
1798
- # PERFORMANCE FIX: Optimize uppercase conversion while preserving complex types
1799
- for row in csv_data:
1800
- updated_row = {}
1801
- for key, value in row.items():
1802
- # Preserve internal/derived fields intact (e.g., `_all_surgery_dates`, `_surgery_date_to_diagnosis`)
1803
- if isinstance(key, str) and key.startswith('_'):
1804
- updated_row[key] = value
1805
- continue
1806
- # Uppercase plain strings
1807
- if isinstance(value, str):
1808
- updated_row[key] = value.upper()
1809
- continue
1810
- # Preserve complex containers; optionally uppercase their string contents
1811
- if isinstance(value, list):
1812
- updated_row[key] = [elem.upper() if isinstance(elem, str) else elem for elem in value]
1813
- continue
1814
- if isinstance(value, dict):
1815
- updated_row[key] = {k: (v.upper() if isinstance(v, str) else v) for k, v in value.items()}
1816
- continue
1817
- # Leave datetimes as-is; coerce simple scalars to string upper for consistency
1818
- if isinstance(value, datetime):
1819
- updated_row[key] = value
1820
- else:
1821
- updated_row[key] = str(value).upper() if value is not None else value
1
+ # MediBot_Preprocessor_lib.py
2
+ """
3
+ Core preprocessing library for MediBot
4
+ Contains core preprocessing functions and utilities.
5
+ """
6
+
7
+ import csv, time, os, sys
8
+ from datetime import datetime, timedelta
9
+ from collections import OrderedDict
10
+
11
+ # Try to import chardet for encoding detection
12
+ try:
13
+ import chardet
14
+ except ImportError:
15
+ chardet = None # Fallback if chardet is not available
16
+
17
+ # SORTING STRATEGY CONFIGURATION
18
+ # Set to 'schedule_based' to enable surgery schedule sorting
19
+ # Set to 'date_based' to use current date-based sorting (default)
20
+ SORTING_STRATEGY = 'date_based' # Hard-coded with clear comments
21
+
22
+ # When enabled, patients will be sorted based on their position in the DOCX surgery schedule
23
+ # When disabled, patients will be sorted by earliest surgery date (current behavior)
24
+
25
+ # Use core utilities for standardized imports
26
+ from MediCafe.core_utils import (
27
+ import_medibot_module,
28
+ import_medilink_module,
29
+ get_config_loader_with_fallback
30
+ )
31
+
32
+ # Initialize configuration loader with fallback
33
+ MediLink_ConfigLoader = get_config_loader_with_fallback()
34
+
35
+ # Import MediLink_DataMgmt using centralized import function
36
+ MediLink_DataMgmt = import_medilink_module('MediLink_DataMgmt')
37
+
38
+ # Import MediBot modules using centralized import functions
39
+ MediBot_UI = import_medibot_module('MediBot_UI')
40
+ if MediBot_UI:
41
+ app_control = getattr(MediBot_UI, 'app_control', None)
42
+ get_app_control = getattr(MediBot_UI, '_get_app_control', None)
43
+ def _ac():
44
+ try:
45
+ return get_app_control() if get_app_control else getattr(MediBot_UI, 'app_control', None)
46
+ except Exception:
47
+ return getattr(MediBot_UI, 'app_control', None)
48
+ else:
49
+ app_control = None
50
+
51
+ MediBot_docx_decoder = import_medibot_module('MediBot_docx_decoder')
52
+ if MediBot_docx_decoder:
53
+ parse_docx = getattr(MediBot_docx_decoder, 'parse_docx', None)
54
+ else:
55
+ parse_docx = None
56
+
57
+ # Add the parent directory of the project to the Python path
58
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
59
+
60
+ # Configuration cache to avoid repeated loading
61
+ _config_cache = None
62
+ _crosswalk_cache = None
63
+
64
+ # Use core utilities for standardized imports
65
+ from MediCafe.core_utils import get_shared_config_loader
66
+ MediLink_ConfigLoader = get_shared_config_loader()
67
+
68
+ # Ensure MediLink_ConfigLoader is available
69
+ if MediLink_ConfigLoader is None:
70
+ print("Warning: MediLink_ConfigLoader not available. Some functionality may be limited.")
71
+ # Create a minimal fallback logger
72
+ class FallbackLogger:
73
+ def log(self, message, level="INFO"):
74
+ print("[{}] {}".format(level, message))
75
+ MediLink_ConfigLoader = FallbackLogger()
76
+
77
+ # Import centralized logging configuration
78
+ try:
79
+ from MediCafe.logging_config import PERFORMANCE_LOGGING
80
+ except ImportError:
81
+ # Fallback to local flag if centralized config is not available
82
+ PERFORMANCE_LOGGING = False
83
+
84
+ # XP Compatibility: Add robust fallback for configuration loading
85
+ def get_cached_configuration_xp_safe():
86
+ """
87
+ XP-compatible version of get_cached_configuration with robust fallbacks.
88
+ """
89
+ global _config_cache, _crosswalk_cache
90
+
91
+ # If we already have cached data, return it
92
+ if _config_cache is not None and _crosswalk_cache is not None:
93
+ return _config_cache, _crosswalk_cache
94
+
95
+ # Try to load configuration using the standard method
96
+ try:
97
+ if MediLink_ConfigLoader and hasattr(MediLink_ConfigLoader, 'load_configuration'):
98
+ _config_cache, _crosswalk_cache = MediLink_ConfigLoader.load_configuration()
99
+ return _config_cache, _crosswalk_cache
100
+ except Exception as e:
101
+ print("Warning: Failed to load configuration via MediLink_ConfigLoader: {}".format(e))
102
+
103
+ # Fallback: Try to load configuration files directly
104
+ try:
105
+ import json
106
+ project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
107
+
108
+ # Try to load config.json
109
+ config_path = os.path.join(project_dir, 'json', 'config.json')
110
+ if os.path.exists(config_path):
111
+ with open(config_path, 'r') as f:
112
+ _config_cache = json.load(f)
113
+ else:
114
+ _config_cache = {}
115
+
116
+ # Try to load crosswalk.json
117
+ crosswalk_path = os.path.join(project_dir, 'json', 'crosswalk.json')
118
+ if os.path.exists(crosswalk_path):
119
+ with open(crosswalk_path, 'r') as f:
120
+ _crosswalk_cache = json.load(f)
121
+ else:
122
+ _crosswalk_cache = {}
123
+
124
+ return _config_cache, _crosswalk_cache
125
+
126
+ except Exception as e:
127
+ print("Warning: Failed to load configuration files directly: {}".format(e))
128
+ # Return empty defaults
129
+ _config_cache = {}
130
+ _crosswalk_cache = {}
131
+ return _config_cache, _crosswalk_cache
132
+
133
+ # --- Helper: Read endpoint default from config with safe fallback (XP-safe) ---
134
+ def _get_default_endpoint(config):
135
+ try:
136
+ mlc = config.get('MediLink_Config', {}) if isinstance(config, dict) else {}
137
+ default_ep = mlc.get('default_endpoint', None)
138
+ return default_ep if default_ep else 'OPTUMEDI'
139
+ except Exception:
140
+ return 'OPTUMEDI'
141
+
142
+ class InitializationError(Exception):
143
+ def __init__(self, message):
144
+ self.message = message
145
+ super().__init__(self.message)
146
+
147
+ def initialize(config):
148
+ global AHK_EXECUTABLE, CSV_FILE_PATH, field_mapping, page_end_markers
149
+
150
+ required_keys = {
151
+ 'AHK_EXECUTABLE': "",
152
+ 'CSV_FILE_PATH': "",
153
+ 'field_mapping': {},
154
+ 'page_end_markers': []
155
+ }
156
+
157
+ for key, default in required_keys.items():
158
+ try:
159
+ globals()[key] = config.get(key, default) if key != 'field_mapping' else OrderedDict(config.get(key, default))
160
+ except AttributeError:
161
+ raise InitializationError("Error: '{}' not found in config.".format(key))
162
+
163
+ def get_cached_configuration():
164
+ """
165
+ Returns cached configuration and crosswalk data to avoid repeated I/O operations.
166
+ """
167
+ return get_cached_configuration_xp_safe()
168
+
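A typical call site, matching how `update_diagnosis_codes` consumes the pair; after the first call, the module-level cache is returned without touching disk:

```python
config, crosswalk = get_cached_configuration()
local_storage_path = config.get('MediLink_Config', {}).get('local_storage_path', '')
diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {})
```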
169
+ def open_csv_for_editing(csv_file_path):
170
+ try:
171
+ # Open the CSV file with its associated application
172
+ os.system('start "" "{}"'.format(csv_file_path))
173
+ print("After saving the revised CSV, please re-run MediBot.")
174
+ except Exception as e:
175
+ print("Failed to open CSV file:", e)
176
+
177
+ # Function to clean the headers
178
+ def clean_header(headers):
179
+ """
180
+ Cleans the header strings by removing unwanted characters and trimming whitespace.
181
+
182
+ Parameters:
183
+ headers (list of str): The original header strings.
184
+
185
+ Returns:
186
+ list of str: The cleaned header strings.
187
+ """
188
+ cleaned_headers = []
189
+
190
+ for header in headers:
191
+ # Strip leading and trailing whitespace
192
+ cleaned_header = header.strip()
193
+ # Remove unwanted characters while keeping spaces, alphanumeric characters, hyphens, and underscores
194
+ cleaned_header = ''.join(char for char in cleaned_header if char.isalnum() or char.isspace() or char in ['-', '_'])
195
+ cleaned_headers.append(cleaned_header)
196
+
197
+ # Log the original and cleaned headers for debugging
198
+ MediLink_ConfigLoader.log("Original headers: {}".format(headers), level="INFO")
199
+ MediLink_ConfigLoader.log("Cleaned headers: {}".format(cleaned_headers), level="INFO")
200
+
201
+ # Check if 'Surgery Date' is in the cleaned headers
202
+ if 'Surgery Date' not in cleaned_headers:
203
+ MediLink_ConfigLoader.log("WARNING: 'Surgery Date' header not found after cleaning.", level="WARNING")
204
+ print("WARNING: 'Surgery Date' header not found after cleaning.")
205
+ raise ValueError("Error: 'Surgery Date' header not found after cleaning.")
206
+
207
+ return cleaned_headers
208
+
209
+ # Function to load and process CSV data
210
+ def load_csv_data(csv_file_path):
211
+ try:
212
+ # Check if the file exists
213
+ if not os.path.exists(csv_file_path):
214
+ raise FileNotFoundError("***Error: CSV file '{}' not found.".format(csv_file_path))
215
+
216
+ # Detect the file encoding
217
+ with open(csv_file_path, 'rb') as f:
218
+ raw_data = f.read()
219
+ if chardet:
220
+ result = chardet.detect(raw_data)
221
+ encoding = result['encoding']
222
+ confidence = result['confidence']
223
+ else:
224
+ # Fallback to UTF-8 when chardet is not available
225
+ encoding = 'utf-8'
226
+ confidence = 1.0
227
+ print("Detected encoding: {} (Confidence: {:.2f})".format(encoding, confidence))
228
+
229
+ # Read the CSV file with the detected encoding
230
+ with open(csv_file_path, 'r', encoding=encoding) as csvfile:
231
+ reader = csv.DictReader(csvfile)
232
+ # Clean the headers
233
+ cleaned_headers = clean_header(reader.fieldnames)
234
+
235
+ # PERFORMANCE FIX: Use zip() instead of range(len()) for header mapping
236
+ header_mapping = {clean: orig for clean, orig in zip(cleaned_headers, reader.fieldnames)}
237
+
238
+ # Process the remaining rows - optimize by pre-allocating the list
239
+ csv_data = []
240
+ # Pre-allocate list size if we can estimate it (optional optimization)
241
+ # csv_data = [None] * estimated_size # if we had row count
242
+
243
+ for row in reader:
244
+ # PERFORMANCE FIX: Use zip() instead of range(len()) for row processing
245
+ cleaned_row = {clean: row[header_mapping[clean]] for clean in cleaned_headers}
246
+ csv_data.append(cleaned_row)
247
+
248
+ return csv_data # Return a list of dictionaries
249
+ except FileNotFoundError as e:
250
+ print(e) # Print the informative error message
251
+ print("Hint: Check if CSV file is located in the expected directory or specify a different path in config file.")
252
+ print("Please correct the issue and re-run MediBot.")
253
+ sys.exit(1) # Halt the script
254
+ except IOError as e:
255
+ print("Error reading CSV file: {}. Please check the file path and permissions.".format(e))
256
+ sys.exit(1) # Halt the script in case of other IO errors
257
+
258
+ # CSV Pre-processor Helper functions
259
+ def add_columns(csv_data, column_headers):
260
+ """
261
+ Adds one or multiple columns to the CSV data.
262
+
263
+ Parameters:
264
+ csv_data (list of dict): The CSV data where each row is represented as a dictionary.
265
+ column_headers (list of str or str): A list of column headers to be added to each row, or a single column header.
266
+
267
+ Returns:
268
+ None: The function modifies the csv_data in place.
269
+ """
270
+ if isinstance(column_headers, str):
271
+ column_headers = [column_headers]
272
+ elif not isinstance(column_headers, list):
273
+ raise ValueError("column_headers should be a list or a string")
274
+
275
+ # PERFORMANCE FIX: Optimize column initialization to avoid nested loop
276
+ for row in csv_data:
277
+ # Use dict.update() to set multiple columns at once
278
+ row.update({header: '' for header in column_headers})
279
+
280
+ # Extracting the list to a variable for future refactoring:
281
+ def filter_rows(csv_data):
282
+ # TODO: This should be written in the crosswalk and not hardcoded here.
283
+ excluded_insurance = {'AETNA', 'AETNA MEDICARE', 'HUMANA MED HMO'}
284
+ csv_data[:] = [row for row in csv_data if row.get('Patient ID') and row.get('Primary Insurance') not in excluded_insurance]
285
+
286
+ def detect_date_format(date_str):
287
+ """
288
+ PERFORMANCE OPTIMIZATION: Quickly detect the most likely date format
289
+ to avoid trying all formats for every date string.
290
+
291
+ Parameters:
292
+ - date_str (str): The date string to analyze
293
+
294
+ Returns:
295
+ - str: The most likely format string, or None if unclear
296
+ """
297
+ if not date_str:
298
+ return None
299
+
300
+ # Remove time components if present
301
+ date_only = date_str.split()[0]
302
+
303
+ # Count separators to guess format
304
+ slash_count = date_only.count('/')
305
+ dash_count = date_only.count('-')
306
+
307
+ # Check for 4-digit year (likely YYYY format)
308
+ if len(date_only) >= 10: # YYYY-MM-DD or YYYY/MM/DD
309
+ if dash_count == 2:
310
+ return '%Y-%m-%d'
311
+ elif slash_count == 2:
312
+ return '%Y/%m/%d'
313
+
314
+ # Check for 2-digit year (likely MM/DD/YY or MM-DD-YY)
315
+ if len(date_only) >= 8: # MM/DD/YY or MM-DD-YY
316
+ if dash_count == 2:
317
+ return '%m-%d-%y'
318
+ elif slash_count == 2:
319
+ return '%m/%d/%y'
320
+
321
+ # Default to most common format (MM/DD/YYYY)
322
+ if dash_count == 2:
323
+ return '%m-%d-%Y'
324
+ elif slash_count == 2:
325
+ return '%m/%d/%Y'
326
+
327
+ return None
328
+
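The detector is a cheap first guess keyed only on separators and length, so it can miss: MM/DD/YYYY and YYYY/MM/DD are both 10 characters. `clean_surgery_date_string` below always falls back to the full format list when the guess fails to parse. For example:

```python
detect_date_format('2024-03-15')  # '%Y-%m-%d'
detect_date_format('12-25-23')    # '%m-%d-%y'
detect_date_format('12/25/2023')  # '%Y/%m/%d' (wrong guess; strptime fails and the
                                  # caller retries '%m/%d/%Y' explicitly)
```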
329
+ class OptimizedDate:
330
+ """
331
+ Optimized date object that pre-computes all common format variations
332
+ to avoid redundant datetime conversions throughout the application.
333
+ """
334
+ def __init__(self, datetime_obj):
335
+ self.datetime = datetime_obj
336
+ # Pre-compute all common format variations
337
+ self._display_short = datetime_obj.strftime('%m-%d') # For table display
338
+ self._display_full = datetime_obj.strftime('%m-%d-%Y') # Full format
339
+ self._medisoft_format = datetime_obj.strftime('%m%d%Y') # For Medisoft entry
340
+ self._iso_format = datetime_obj.strftime('%Y-%m-%d') # For sorting/comparison
341
+
342
+ @property
343
+ def display_short(self):
344
+ """Short display format: MM-DD"""
345
+ return self._display_short
346
+
347
+ @property
348
+ def display_full(self):
349
+ """Full display format: MM-DD-YYYY"""
350
+ return self._display_full
351
+
352
+ @property
353
+ def medisoft_format(self):
354
+ """Medisoft entry format: MMDDYYYY"""
355
+ return self._medisoft_format
356
+
357
+ @property
358
+ def iso_format(self):
359
+ """ISO format for sorting: YYYY-MM-DD"""
360
+ return self._iso_format
361
+
362
+ def __str__(self):
363
+ return self._display_full
364
+
365
+ def __repr__(self):
366
+ return "OptimizedDate({})".format(self._display_full)
367
+
368
+ def __eq__(self, other):
369
+ if isinstance(other, OptimizedDate):
370
+ return self.datetime == other.datetime
371
+ elif hasattr(other, 'strftime'): # datetime object
372
+ return self.datetime == other
373
+ return False
374
+
375
+ def __lt__(self, other):
376
+ if isinstance(other, OptimizedDate):
377
+ return self.datetime < other.datetime
378
+ elif hasattr(other, 'strftime'): # datetime object
379
+ return self.datetime < other
380
+ return NotImplemented
381
+
382
+ def __gt__(self, other):
383
+ if isinstance(other, OptimizedDate):
384
+ return self.datetime > other.datetime
385
+ elif hasattr(other, 'strftime'): # datetime object
386
+ return self.datetime > other
387
+ return NotImplemented
388
+
389
+ def strftime(self, format_str):
390
+ """Fallback for any custom format needs"""
391
+ return self.datetime.strftime(format_str)
392
+
393
+ @classmethod
394
+ def from_string(cls, date_str, cleaned=False):
395
+ """
396
+ Create OptimizedDate from string, with optional pre-cleaning.
397
+
398
+ Args:
399
+ date_str: Date string to parse
400
+ cleaned: If True, assumes string is already cleaned
401
+
402
+ Returns:
403
+ OptimizedDate object or None if parsing fails
404
+ """
405
+ if not cleaned:
406
+ date_str = clean_surgery_date_string(date_str)
407
+ if not date_str:
408
+ return None
409
+
410
+ # Try standard format first (most common)
411
+ try:
412
+ return cls(datetime.strptime(date_str, '%m/%d/%Y'))
413
+ except ValueError:
414
+ pass
415
+
416
+ # Try alternative formats
417
+ formats = ['%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%Y/%m/%d', '%Y-%m-%d']
418
+ for fmt in formats:
419
+ try:
420
+ return cls(datetime.strptime(date_str, fmt))
421
+ except ValueError:
422
+ continue
423
+
424
+ return None
425
+
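The class pays four strftime calls at construction so that later reads are plain attribute lookups; a quick sketch of the round trip:

```python
d = OptimizedDate.from_string('3/15/2024')
d.display_short    # '03-15'
d.display_full     # '03-15-2024'
d.medisoft_format  # '03152024'
d.iso_format       # '2024-03-15'
OptimizedDate.from_string('not a date')  # None
```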
426
+ def clean_surgery_date_string(date_str):
427
+ """
428
+ Cleans and normalizes surgery date strings to handle damaged data.
429
+
430
+ Parameters:
431
+ - date_str (str): The raw date string from the CSV
432
+
433
+ Returns:
434
+ - str: Cleaned date string in MM/DD/YYYY format, or empty string if unparseable
435
+ """
436
+ if not date_str:
437
+ return ''
438
+
439
+ # Convert to string and strip whitespace
440
+ date_str = str(date_str).strip()
441
+ if not date_str:
442
+ return ''
443
+
444
+ # Remove common problematic characters and normalize
445
+ date_str = date_str.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
446
+ date_str = ' '.join(date_str.split()) # Normalize whitespace
447
+
448
+ # PERFORMANCE OPTIMIZATION: Try detected format first
449
+ detected_format = detect_date_format(date_str)
450
+ if detected_format:
451
+ try:
452
+ parsed_date = datetime.strptime(date_str, detected_format)
453
+ return parsed_date.strftime('%m/%d/%Y')
454
+ except ValueError:
455
+ pass
456
+
457
+ # PERFORMANCE OPTIMIZATION: Try most common format first (MM/DD/YYYY)
458
+ # This reduces the average number of format attempts from 8 to ~1-2
459
+ try:
460
+ parsed_date = datetime.strptime(date_str, '%m/%d/%Y')
461
+ return parsed_date.strftime('%m/%d/%Y')
462
+ except ValueError:
463
+ pass
464
+
465
+ # PERFORMANCE OPTIMIZATION: Try second most common format (MM-DD-YYYY)
466
+ try:
467
+ parsed_date = datetime.strptime(date_str, '%m-%d-%Y')
468
+ return parsed_date.strftime('%m/%d/%Y')
469
+ except ValueError:
470
+ pass
471
+
472
+ # PERFORMANCE OPTIMIZATION: Try 2-digit year formats only if needed
473
+ try:
474
+ parsed_date = datetime.strptime(date_str, '%m/%d/%y')
475
+ return parsed_date.strftime('%m/%d/%Y')
476
+ except ValueError:
477
+ pass
478
+
479
+ try:
480
+ parsed_date = datetime.strptime(date_str, '%m-%d-%y')
481
+ return parsed_date.strftime('%m/%d/%Y')
482
+ except ValueError:
483
+ pass
484
+
485
+ # PERFORMANCE OPTIMIZATION: Try YYYY formats only if needed
486
+ try:
487
+ parsed_date = datetime.strptime(date_str, '%Y/%m/%d')
488
+ return parsed_date.strftime('%m/%d/%Y')
489
+ except ValueError:
490
+ pass
491
+
492
+ try:
493
+ parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
494
+ return parsed_date.strftime('%m/%d/%Y')
495
+ except ValueError:
496
+ pass
497
+
498
+ # PERFORMANCE OPTIMIZATION: Try datetime formats only if needed
499
+ try:
500
+ parsed_date = datetime.strptime(date_str, '%m/%d/%Y %H:%M:%S')
501
+ return parsed_date.strftime('%m/%d/%Y')
502
+ except ValueError:
503
+ pass
504
+
505
+ try:
506
+ parsed_date = datetime.strptime(date_str, '%m-%d-%Y %H:%M:%S')
507
+ return parsed_date.strftime('%m/%d/%Y')
508
+ except ValueError:
509
+ pass
510
+
511
+ # If no format matches, try to extract date components
512
+ try:
513
+ # Remove any time components and extra text
514
+ date_only = date_str.split()[0] # Take first part if there's extra text
515
+
516
+ # Try to extract numeric components
517
+ import re
518
+ numbers = re.findall(r'\d+', date_only)
519
+
520
+ if len(numbers) >= 3:
521
+ # Assume MM/DD/YYYY or MM-DD-YYYY format
522
+ month, day, year = int(numbers[0]), int(numbers[1]), int(numbers[2])
523
+
524
+ # Handle 2-digit years first; the range check below would otherwise reject them
525
+ if year < 100:
526
+ year += 2000 if year < 50 else 1900
527
+
528
+ if 1 <= month <= 12 and 1 <= day <= 31 and 1900 <= year <= 2100:  # Validate ranges
529
+
530
+ parsed_date = datetime(year, month, day)
531
+ return parsed_date.strftime('%m/%d/%Y')
532
+ except (ValueError, IndexError):
533
+ pass
534
+
535
+ # If all parsing attempts fail, return empty string
536
+ return ''
537
+
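A few representative inputs and what the cleaner returns; everything normalizes to MM/DD/YYYY or the empty string:

```python
clean_surgery_date_string('2023-12-05')           # '12/05/2023'
clean_surgery_date_string(' 12-5-23 ')            # '12/05/2023'
clean_surgery_date_string('12/05/2023 07:30:00')  # '12/05/2023'
clean_surgery_date_string('garbage')              # ''
```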
538
+ def convert_surgery_date(csv_data):
539
+ """
540
+ Converts surgery date strings to datetime objects with comprehensive data cleaning.
541
+
542
+ Parameters:
543
+ - csv_data (list): List of dictionaries containing CSV row data
544
+ """
545
+ # TIMING: Start surgery date conversion with granular tracking
546
+ total_start_time = time.time()
547
+ date_cleaning_time = 0
548
+ date_parsing_time = 0
549
+ processed_count = 0
550
+ empty_count = 0
551
+ error_count = 0
552
+
553
+ print("Starting surgery date conversion for {} rows...".format(len(csv_data)))
554
+ # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes
555
+ # MediLink_ConfigLoader.log("Starting surgery date conversion for {} rows...".format(len(csv_data)), level="INFO") # REMOVED
556
+
557
+ # PERFORMANCE OPTIMIZATION: Pre-compile datetime.strptime for the most common format
558
+ # This avoids repeated format string parsing
559
+ standard_format = '%m/%d/%Y'
560
+
561
+ for row_idx, row in enumerate(csv_data, 1):
562
+ surgery_date_str = row.get('Surgery Date', '')
563
+
564
+ if not surgery_date_str:
565
+ empty_count += 1
566
+ # LOGGING STRATEGY: Only log actual errors/failures, not routine empty dates
567
+ # if empty_count <= 5: # Only log first 5 empty dates
568
+ # MediLink_ConfigLoader.log("Warning: Surgery Date not found for row: {}".format(row), level="WARNING")
569
+ # print("Surgery Date not found for row: {}".format(row))
570
+ row['Surgery Date'] = datetime.min # Assign a minimum datetime value if empty
571
+ else:
572
+ # TIMING: Start date string cleaning
573
+ cleaning_start = time.time()
574
+
575
+ # Clean the date string first
576
+ cleaned_date_str = clean_surgery_date_string(surgery_date_str)
577
+
578
+ # TIMING: End date string cleaning
579
+ cleaning_end = time.time()
580
+ date_cleaning_time += (cleaning_end - cleaning_start)
581
+
582
+ if not cleaned_date_str:
583
+ error_count += 1
584
+ # LOGGING STRATEGY: Log actual errors (cleaning failures) at INFO level
585
+ if error_count <= 5: # Only log first 5 errors
586
+ MediLink_ConfigLoader.log("Error: Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row), level="INFO")
587
+ print("Could not clean Surgery Date '{}' for row: {}".format(surgery_date_str, row))
588
+ row['Surgery Date'] = datetime.min # Assign a minimum datetime value if cleaning fails
589
+ else:
590
+ # TIMING: Start date parsing
591
+ parsing_start = time.time()
592
+
593
+ try:
594
+ # PERFORMANCE OPTIMIZATION: Use pre-compiled format string
595
+ # Parse the cleaned date string
596
+ row['Surgery Date'] = datetime.strptime(cleaned_date_str, standard_format)
597
+ processed_count += 1
598
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
599
+ # if processed_count <= 10 or processed_count % 100 == 0: # Log first 10 and every 100th
600
+ # MediLink_ConfigLoader.log("Successfully cleaned and parsed Surgery Date '{}' -> '{}' for row: {}".format(
601
+ # surgery_date_str, cleaned_date_str, row), level="DEBUG")
602
+ except ValueError as e:
603
+ error_count += 1
604
+ # LOGGING STRATEGY: Log actual errors (parsing failures) at INFO level
605
+ if error_count <= 5: # Only log first 5 parsing errors
606
+ MediLink_ConfigLoader.log("Error parsing cleaned Surgery Date '{}': {} for row: {}".format(
607
+ cleaned_date_str, e, row), level="INFO")
608
+ row['Surgery Date'] = datetime.min # Assign a minimum datetime value if parsing fails
609
+
610
+ # TIMING: End date parsing
611
+ parsing_end = time.time()
612
+ date_parsing_time += (parsing_end - parsing_start)
613
+
614
+ # TIMING: End total surgery date conversion
615
+ total_end_time = time.time()
616
+ total_duration = total_end_time - total_start_time
617
+
618
+ if PERFORMANCE_LOGGING:
619
+ print("Surgery date conversion completed:")
620
+ print(" - Total duration: {:.2f} seconds".format(total_duration))
621
+ print(" - Date cleaning time: {:.2f} seconds ({:.1f}%)".format(date_cleaning_time, (date_cleaning_time/total_duration)*100))
622
+ print(" - Date parsing time: {:.2f} seconds ({:.1f}%)".format(date_parsing_time, (date_parsing_time/total_duration)*100))
623
+ print(" - Processed: {} rows, Empty: {} rows, Errors: {} rows".format(processed_count, empty_count, error_count))
624
+
625
+ # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
626
+ MediLink_ConfigLoader.log("Surgery date conversion completed - Total: {:.2f}s, Cleaning: {:.2f}s, Parsing: {:.2f}s, Processed: {}, Empty: {}, Errors: {}".format(
627
+ total_duration, date_cleaning_time, date_parsing_time, processed_count, empty_count, error_count), level="INFO")
628
+
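The conversion mutates rows in place and never raises per row: empty or unparseable dates become the `datetime.min` sentinel that the date-margin and sorting code filter on. A sketch:

```python
rows = [{'Surgery Date': '3/15/2024'},
        {'Surgery Date': ''},
        {'Surgery Date': '??'}]
convert_surgery_date(rows)
# rows[0]['Surgery Date'] == datetime(2024, 3, 15)
# rows[1]['Surgery Date'] == rows[2]['Surgery Date'] == datetime.min
```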
629
+ def _create_common_tie_breakers(row):
630
+ """
631
+ Creates common tie-breaker components used across multiple sorting strategies.
632
+ This follows DRY principle by extracting shared logic.
633
+ """
634
+ last_name = ((row.get('Patient Last') or '')).strip().upper()
635
+ first_name = ((row.get('Patient First') or '')).strip().upper()
636
+ patient_id_tiebreak = str(row.get('Patient ID') or '')
637
+ return (last_name, first_name, patient_id_tiebreak)
638
+
639
+ def _normalize_surgery_date(row):
640
+ """
641
+ Normalizes surgery date for consistent sorting across strategies.
642
+ """
643
+ # Prefer earliest surgery date across all known dates for the patient
644
+ earliest = row.get('_earliest_surgery_date')
645
+ if isinstance(earliest, str) and earliest and earliest != 'MISSING':
646
+ try:
647
+ return datetime.strptime(earliest, '%m-%d-%Y')
648
+ except Exception:
649
+ pass
650
+
651
+ # Fallback to the single Surgery Date field
652
+ surgery_date = row.get('Surgery Date')
653
+ if isinstance(surgery_date, datetime):
654
+ return surgery_date
655
+ elif isinstance(surgery_date, str) and surgery_date.strip():
656
+ try:
657
+ return datetime.strptime(surgery_date, '%m/%d/%Y')
658
+ except ValueError:
659
+ try:
660
+ return datetime.strptime(surgery_date, '%m-%d-%Y')
661
+ except ValueError:
662
+ pass
663
+
664
+ return datetime.min
665
+
666
+ def _get_schedule_position(row):
667
+ """
668
+ Gets the schedule position for a patient from stored DOCX data.
669
+ Returns a high number if no schedule data is available (puts at end).
670
+ """
671
+ schedule_positions = row.get('_schedule_positions', {})
672
+ surgery_date = row.get('Surgery Date')
673
+
674
+ # Convert surgery date to string format for lookup
675
+ if isinstance(surgery_date, datetime):
676
+ surgery_date_str = surgery_date.strftime('%m-%d-%Y')
677
+ else:
678
+ surgery_date_str = str(surgery_date)
679
+
680
+ # Return schedule position if available, otherwise high number (end of list)
681
+ return schedule_positions.get(surgery_date_str, 9999)
682
+
683
+ def _get_surgery_date_string(row):
684
+ """
685
+ Gets surgery date as string for consistent sorting.
686
+ """
687
+ surgery_date = row.get('Surgery Date')
688
+ if isinstance(surgery_date, datetime):
689
+ return surgery_date.strftime('%m-%d-%Y')
690
+ else:
691
+ return str(surgery_date)
692
+
693
+ def _create_date_based_sort_key(row):
694
+ """
695
+ Current date-based sorting logic (extracted from existing sort_key function).
696
+ """
697
+ normalized_date = _normalize_surgery_date(row)
698
+ tie_breakers = _create_common_tie_breakers(row)
699
+ return (normalized_date,) + tie_breakers
700
+
701
+ def _create_schedule_based_sort_key(row):
702
+ """
703
+ Schedule-based sorting logic (new strategy).
704
+ Uses patient position in DOCX surgery schedule as primary sort criterion.
705
+ """
706
+ schedule_position = _get_schedule_position(row)
707
+ surgery_date_str = _get_surgery_date_string(row)
708
+ tie_breakers = _create_common_tie_breakers(row)
709
+ return (schedule_position, surgery_date_str) + tie_breakers
710
+
711
+ def create_sort_key_strategy(strategy_type='date_based'):
712
+ """
713
+ Factory function that returns the appropriate sort key function.
714
+ Follows existing strategy patterns in the codebase.
715
+ """
716
+ if strategy_type == 'schedule_based':
717
+ return _create_schedule_based_sort_key
718
+ else:
719
+ return _create_date_based_sort_key
720
+
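The factory keeps both strategies behind one call site; `sort_and_deduplicate` below selects one via the module-level `SORTING_STRATEGY` flag:

```python
sort_key = create_sort_key_strategy('date_based')
csv_data.sort(key=sort_key)  # (earliest surgery date, last name, first name, patient ID)

# 'schedule_based' sorts by DOCX schedule position first, then the date string:
schedule_key = create_sort_key_strategy('schedule_based')
```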
721
+ def sort_and_deduplicate(csv_data):
722
+ # Create a dictionary to hold unique patients based on Patient ID
723
+ unique_patients = {}
724
+ # Create a dictionary to store multiple surgery dates per patient
725
+ patient_surgery_dates = {}
726
+
727
+ # Iterate through the CSV data and populate the unique_patients dictionary
728
+ for row in csv_data:
729
+ patient_id = row.get('Patient ID')
730
+ surgery_date = row.get('Surgery Date')
731
+
732
+ if patient_id not in unique_patients:
733
+ unique_patients[patient_id] = row
734
+ patient_surgery_dates[patient_id] = [surgery_date]
735
+ else:
736
+ # If the patient ID already exists, compare surgery dates
737
+ existing_row = unique_patients[patient_id]
738
+ existing_date = existing_row['Surgery Date']
739
+
740
+ # Ensure both dates are comparable by converting to datetime objects
741
+ def normalize_date_for_comparison(date_value):
742
+ if isinstance(date_value, datetime):
743
+ return date_value
744
+ elif isinstance(date_value, str) and date_value.strip():
745
+ try:
746
+ # Try to parse the string as a date
747
+ return datetime.strptime(date_value, '%m/%d/%Y')
748
+ except ValueError:
749
+ try:
750
+ return datetime.strptime(date_value, '%m-%d-%Y')
751
+ except ValueError:
752
+ # If parsing fails, return minimum datetime
753
+ return datetime.min
754
+ else:
755
+ # Empty or invalid values get minimum datetime
756
+ return datetime.min
757
+
758
+ normalized_surgery_date = normalize_date_for_comparison(surgery_date)
759
+ normalized_existing_date = normalize_date_for_comparison(existing_date)
760
+
761
+ # Keep the most current demographic data (later surgery date takes precedence)
762
+ if normalized_surgery_date > normalized_existing_date:
763
+ # Store the old row's surgery date before replacing
764
+ old_date = existing_row['Surgery Date']
765
+ # Add the old date to the list if it's not already there
766
+ if old_date not in patient_surgery_dates[patient_id]:
767
+ patient_surgery_dates[patient_id].append(old_date)
768
+ # Replace with newer row (better demographics)
769
+ unique_patients[patient_id] = row
770
+ # Add the new surgery date to the list if it's not already there
771
+ if surgery_date not in patient_surgery_dates[patient_id]:
772
+ patient_surgery_dates[patient_id].append(surgery_date)
773
+ else:
774
+ # Add this surgery date to the list for this patient if it's not already there
775
+ if surgery_date not in patient_surgery_dates[patient_id]:
776
+ patient_surgery_dates[patient_id].append(surgery_date)
777
+
778
+ # Store the surgery dates information in the first row of each patient for later access
779
+ for patient_id, row in unique_patients.items():
780
+ # Convert surgery dates to strings for consistent storage
781
+ surgery_date_strings = []
782
+ for date in patient_surgery_dates[patient_id]:
783
+ if isinstance(date, datetime):
784
+ if date == datetime.min:
785
+ surgery_date_strings.append('MISSING')
786
+ else:
787
+ surgery_date_strings.append(date.strftime('%m-%d-%Y'))
788
+ else:
789
+ surgery_date_strings.append(str(date) if date else 'MISSING')
790
+
791
+ # Remove duplicates and sort
792
+ unique_surgery_dates = list(set(surgery_date_strings))
793
+ # Sort defensively: 'MISSING' and any unparseable date strings sort as the oldest date
+ # instead of raising ValueError mid-sort.
+ def _safe_surgery_date_key(x):
+ try:
+ return datetime.strptime(x, '%m-%d-%Y') if x != 'MISSING' else datetime.min
+ except (ValueError, TypeError):
+ return datetime.min
+ sorted_surgery_dates = sorted(unique_surgery_dates, key=_safe_surgery_date_key)
794
+ row['_all_surgery_dates'] = sorted_surgery_dates
795
+ row['_primary_surgery_date'] = row['Surgery Date'] # Keep track of which date has the demographics
796
+ # Compute and store earliest surgery date for emission sort
797
+ earliest_dt = None
798
+ earliest_str = None
799
+ for d in sorted_surgery_dates:
800
+ if d and d != 'MISSING':
801
+ try:
802
+ earliest_dt = datetime.strptime(d, '%m-%d-%Y')
803
+ earliest_str = d
804
+ break
805
+ except Exception:
806
+ pass
807
+ # Fallback to demographics date if earliest could not be determined
808
+ if earliest_str is None:
809
+ try:
810
+ sd = row.get('Surgery Date')
811
+ if isinstance(sd, datetime) and sd != datetime.min:
812
+ earliest_dt = sd
813
+ earliest_str = sd.strftime('%m-%d-%Y')
814
+ elif isinstance(sd, str) and sd.strip():
815
+ try:
816
+ earliest_dt = datetime.strptime(sd, '%m/%d/%Y')
817
+ except Exception:
818
+ try:
819
+ earliest_dt = datetime.strptime(sd, '%m-%d-%Y')
820
+ except Exception:
821
+ earliest_dt = None
822
+ earliest_str = sd
823
+ except Exception:
824
+ earliest_dt = None
825
+ earliest_str = None
826
+ row['_earliest_surgery_date'] = earliest_str
827
+
830
+ # Convert the unique_patients dictionary back to a list and sort it
831
+ # Use strategy pattern for sorting (follows existing codebase patterns)
832
+ sort_key_func = create_sort_key_strategy(SORTING_STRATEGY)
833
+
834
+ csv_data[:] = sorted(unique_patients.values(), key=sort_key_func)
835
+
836
+ # TODO: Consider adding an option in the config to sort based on Surgery Schedules when available.
837
+ # If no schedule is available, the current sorting strategy will be used.
838
+ #
839
+ # IMPLEMENTATION STATUS: Backend infrastructure is ready.
840
+ # To enable surgery schedule sorting, set SORTING_STRATEGY = 'schedule_based' above.
841
+ # The system will automatically fall back to date-based sorting if schedule data is unavailable.
842
+
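A small worked example of the deduplication rule above (the later surgery date wins the demographics, every date is retained), assuming the module-level sort helpers are importable; rows are invented:

    from datetime import datetime

    rows = [
        {'Patient ID': 'P1', 'Surgery Date': datetime(2024, 1, 5), 'Patient Address1': 'OLD ADDR'},
        {'Patient ID': 'P1', 'Surgery Date': datetime(2024, 2, 10), 'Patient Address1': 'NEW ADDR'},
    ]
    sort_and_deduplicate(rows)
    # One row per patient survives; the 02/10 row contributes the demographics, and roughly:
    # rows[0]['Patient Address1']       == 'NEW ADDR'
    # rows[0]['_all_surgery_dates']     == ['01-05-2024', '02-10-2024']
    # rows[0]['_earliest_surgery_date'] == '01-05-2024'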
843
+ def combine_fields(csv_data):
844
+ for row in csv_data:
845
+ # Safely handle the 'Surgery Date' conversion with clear missing indicator
846
+ surgery_date = row.get('Surgery Date')
847
+ try:
848
+ if isinstance(surgery_date, datetime):
849
+ if surgery_date == datetime.min:
850
+ row['Surgery Date'] = 'MISSING'
851
+ else:
852
+ row['Surgery Date'] = surgery_date.strftime('%m-%d-%Y')
853
+ elif surgery_date:
854
+ # Already a non-empty string
855
+ row['Surgery Date'] = str(surgery_date)
856
+ else:
857
+ row['Surgery Date'] = 'MISSING'
858
+ except Exception:
859
+ row['Surgery Date'] = 'MISSING'
860
+
861
+ first_name = '_'.join(part.strip() for part in row.get('Patient First', '').split()) # Join the first name parts with underscores after cleaning.
862
+ middle_name = row.get('Patient Middle', '').strip()
863
+ middle_name = middle_name[0] if middle_name else '' # Take only the first character, or empty if absent (a single-character initial is kept)
864
+ last_name = '_'.join(part.strip() for part in row.get('Patient Last', '').split()) # Join the last name parts with underscores after cleaning.
865
+ row['Patient Name'] = ', '.join(filter(None, [last_name, first_name])) + (' ' + middle_name if middle_name else '') # Comma between last and first, space before middle
866
+
867
+ address1 = row.get('Patient Address1', '').strip()
868
+ address2 = row.get('Patient Address2', '').strip()
869
+ row['Patient Street'] = ' '.join(filter(None, [address1, address2])) # Join non-empty addresses
870
+
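A quick illustration of combine_fields on a single invented row (reflecting the single-character middle-initial handling above):

    from datetime import datetime

    row = {
        'Surgery Date': datetime(2024, 2, 10),
        'Patient First': 'Mary Ann', 'Patient Middle': 'Q', 'Patient Last': 'De La Cruz',
        'Patient Address1': '123 Main St', 'Patient Address2': 'Apt 4',
    }
    combine_fields([row])
    print(row['Surgery Date'])    # 02-10-2024
    print(row['Patient Name'])    # De_La_Cruz, Mary_Ann Q
    print(row['Patient Street'])  # 123 Main St Apt 4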
871
+ def apply_replacements(csv_data, crosswalk):
872
+ replacements = crosswalk.get('csv_replacements', {})
873
+ # Pre-define the keys to check for better performance
874
+ keys_to_check = ['Patient SSN', 'Primary Insurance', 'Ins1 Payer ID']
875
+
876
+ for row in csv_data:
877
+ # Use early termination - check each replacement only if needed
878
+ for old_value, new_value in replacements.items():
879
+ replacement_made = False
880
+ for key in keys_to_check:
881
+ if row.get(key) == old_value:
882
+ row[key] = new_value
883
+ replacement_made = True
884
+ break # Exit the key loop once a replacement is made
885
+ if replacement_made:
886
+ break # Exit the replacement loop once any replacement is made
887
+
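Illustration of the replacement pass with an invented crosswalk entry; note the early-termination logic applies at most one replacement per row:

    crosswalk = {'csv_replacements': {'AETNA BETTER HEALTH': 'AETNA'}}
    rows = [{'Patient SSN': '000-00-0000',
             'Primary Insurance': 'AETNA BETTER HEALTH',
             'Ins1 Payer ID': '60054'}]
    apply_replacements(rows, crosswalk)
    print(rows[0]['Primary Insurance'])  # AETNA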
888
+ import difflib
889
+ from collections import defaultdict
890
+
891
+ def find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names):
892
+ """
893
+ Finds the best matching Medisoft ID for a given insurance name using fuzzy matching.
894
+
895
+ Parameters:
896
+ - insurance_name (str): The insurance name from the CSV row.
897
+ - medisoft_ids (list): List of Medisoft IDs associated with the Payer ID.
898
+ - medisoft_to_mains_names (dict): Mapping from Medisoft ID to list of MAINS names.
899
+
900
+ Returns:
901
+ - int or None: The best matching Medisoft ID or None if no match is found.
902
+ """
903
+ best_match_ratio = 0
904
+ best_medisoft_id = None
905
+
906
+ # Pre-process insurance name once
907
+ processed_insurance = ''.join(c for c in insurance_name if not c.isdigit()).upper()
908
+
909
+ for medisoft_id in medisoft_ids:
910
+ mains_names = medisoft_to_mains_names.get(medisoft_id, [])
911
+ for mains_name in mains_names:
912
+ # Preprocess names by extracting non-numeric characters and converting to uppercase
913
+ # Use more efficient string processing
914
+ processed_mains = ''.join(c for c in mains_name if not c.isdigit()).upper()
915
+
916
+ # Log the processed names before computing the match ratio
917
+ MediLink_ConfigLoader.log("Processing Medisoft ID '{}': Comparing processed insurance '{}' with processed mains '{}'.".format(medisoft_id, processed_insurance, processed_mains), level="DEBUG")
918
+
919
+ # Compute the similarity ratio
920
+ match_ratio = difflib.SequenceMatcher(None, processed_insurance, processed_mains).ratio()
921
+
922
+ # Log the match ratio
923
+ MediLink_ConfigLoader.log("Match ratio for Medisoft ID '{}': {:.2f}".format(medisoft_id, match_ratio), level="DEBUG")
924
+
925
+ if match_ratio > best_match_ratio:
926
+ best_match_ratio = match_ratio
927
+ best_medisoft_id = medisoft_id
928
+ # Log the current best match
929
+ MediLink_ConfigLoader.log("New best match found: Medisoft ID '{}' with match ratio {:.2f}".format(best_medisoft_id, best_match_ratio), level="DEBUG")
930
+
931
+ # Log the final best match ratio and ID
932
+ MediLink_ConfigLoader.log("Final best match ratio: {:.2f} for Medisoft ID '{}'".format(best_match_ratio, best_medisoft_id), level="DEBUG")
933
+
934
+ # No threshold applied, return the best match found
935
+ return best_medisoft_id
936
+
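The fuzzy score underlying the selection above is plain difflib; a standalone sketch with invented names, using the same preprocessing (digit-stripping and uppercasing) as find_best_medisoft_id:

    import difflib

    insurance = 'Bcbs Florida 123'
    mains = 'BLUE CROSS BLUE SHIELD OF FLORIDA'
    a = ''.join(c for c in insurance if not c.isdigit()).upper()
    b = ''.join(c for c in mains if not c.isdigit()).upper()
    print(round(difflib.SequenceMatcher(None, a, b).ratio(), 2))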
937
+ def NEW_update_insurance_ids(csv_data, config, crosswalk):
938
+ """
939
+ Updates the 'Ins1 Insurance ID' field in each row of csv_data based on the crosswalk and MAINS data.
940
+
941
+ Parameters:
942
+ - csv_data (list of dict): The CSV data where each row is represented as a dictionary.
943
+ - config (dict): Configuration object containing necessary paths and parameters.
944
+ - crosswalk (dict): Crosswalk data containing mappings between Payer IDs and Medisoft IDs.
945
+
946
+ Returns:
947
+ - None: The function modifies the csv_data in place.
948
+ """
949
+ processed_payer_ids = set() # Track processed Payer IDs
950
+ MediLink_ConfigLoader.log("Starting update of insurance IDs.", level="INFO")
951
+
952
+ # PERFORMANCE FIX: Pre-build flattened payer lookup cache to avoid nested dictionary access
953
+ payer_cache = {}
954
+ crosswalk_payers = crosswalk.get('payer_id', {})
955
+ for payer_id, details in crosswalk_payers.items():
956
+ payer_cache[payer_id] = {
957
+ 'medisoft_id': details.get('medisoft_id', []),
958
+ 'medisoft_medicare_id': details.get('medisoft_medicare_id', []),
959
+ 'endpoint': details.get('endpoint', None)
960
+ }
961
+ MediLink_ConfigLoader.log("Built payer cache for {} payers".format(len(payer_cache)), level="DEBUG")
962
+
963
+ # Load MAINS data to get mapping from Medisoft ID to MAINS names
964
+ insurance_to_id = load_insurance_data_from_mains(config) # Assuming it returns a dict mapping insurance names to IDs
965
+ MediLink_ConfigLoader.log("Loaded MAINS data for insurance to ID mapping.", level="DEBUG")
966
+
967
+ # Invert the mapping to get Medisoft ID to MAINS names
968
+ medisoft_to_mains_names = defaultdict(list)
969
+ for insurance_name, medisoft_id in insurance_to_id.items():
970
+ medisoft_to_mains_names[medisoft_id].append(insurance_name)
971
+
972
+ for row_idx, row in enumerate(csv_data, 1):
973
+ # PERFORMANCE FIX: Store row index to avoid O(n) csv_data.index() calls later
974
+ row['_row_index'] = row_idx
975
+ ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
976
+ MediLink_ConfigLoader.log("Processing row with Ins1 Payer ID: '{}'.".format(ins1_payer_id), level="DEBUG")
977
+
978
+ if ins1_payer_id:
979
+ # Mark this Payer ID as processed
980
+ if ins1_payer_id not in processed_payer_ids:
981
+ processed_payer_ids.add(ins1_payer_id) # Add to set
982
+ MediLink_ConfigLoader.log("Marked Payer ID '{}' as processed.".format(ins1_payer_id), level="DEBUG")
983
+
984
+ # PERFORMANCE FIX: Use flattened cache instead of nested dictionary lookups
985
+ payer_info = payer_cache.get(ins1_payer_id, {})
986
+ medisoft_ids = payer_info.get('medisoft_id', [])
987
+ MediLink_ConfigLoader.log("Retrieved Medisoft IDs for Payer ID '{}': {}".format(ins1_payer_id, medisoft_ids), level="DEBUG")
988
+
989
+ if not medisoft_ids:
990
+ MediLink_ConfigLoader.log("No Medisoft IDs available for Payer ID '{}', creating placeholder entry.".format(ins1_payer_id), level="WARNING")
991
+ # Create a placeholder entry in the crosswalk and cache
992
+ placeholder_entry = {
993
+ 'medisoft_id': [], # Placeholder for future Medisoft IDs
994
+ 'medisoft_medicare_id': [], # Placeholder for future Medicare IDs
995
+ 'endpoint': None # Placeholder for future endpoint
996
+ }
997
+ if 'payer_id' not in crosswalk:
998
+ crosswalk['payer_id'] = {}
999
+ crosswalk['payer_id'][ins1_payer_id] = placeholder_entry
1000
+ # PERFORMANCE FIX: Update cache with placeholder entry
1001
+ payer_cache[ins1_payer_id] = placeholder_entry
1002
+ continue # Skip further processing for this Payer ID
1003
+
1004
+ # If only one Medisoft ID is associated, assign it directly
1005
+ if len(medisoft_ids) == 1:
1006
+ try:
1007
+ medisoft_id = int(medisoft_ids[0])
1008
+ row['Ins1 Insurance ID'] = medisoft_id
1009
+ # PERFORMANCE FIX: Use enumerate index instead of csv_data.index() which is O(n)
1010
+ row_number = row.get('_row_index', 'Unknown')  # rows are dicts, so .get() (not getattr) reads the stored index
1011
+ MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row number {} with Payer ID '{}'.".format(medisoft_id, row_number, ins1_payer_id), level="DEBUG")
1012
+ except ValueError as e:
1013
+ MediLink_ConfigLoader.log("Error converting Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
1014
+ row['Ins1 Insurance ID'] = None
1015
+ continue # Move to the next row
1016
+
1017
+ # If multiple Medisoft IDs are associated, perform fuzzy matching
1018
+ insurance_name = row.get('Primary Insurance', '').strip()
1019
+ if not insurance_name:
1020
+ MediLink_ConfigLoader.log("Row with Payer ID '{}' missing 'Primary Insurance', skipping assignment.".format(ins1_payer_id), level="WARNING")
1021
+ continue # Skip if insurance name is missing
1022
+
1023
+ best_medisoft_id = find_best_medisoft_id(insurance_name, medisoft_ids, medisoft_to_mains_names)
1024
+
1025
+ if best_medisoft_id:
1026
+ row['Ins1 Insurance ID'] = best_medisoft_id
1027
+ MediLink_ConfigLoader.log("Assigned Medisoft ID '{}' to row with Payer ID '{}' based on fuzzy match.".format(best_medisoft_id, ins1_payer_id), level="INFO")
1028
+ else:
1029
+ # Default to the first Medisoft ID if no good match is found
1030
+ try:
1031
+ default_medisoft_id = int(medisoft_ids[0])
1032
+ row['Ins1 Insurance ID'] = default_medisoft_id
1033
+ MediLink_ConfigLoader.log("No suitable match found. Defaulted to Medisoft ID '{}' for Payer ID '{}'.".format(default_medisoft_id, ins1_payer_id), level="INFO")
1034
+ except ValueError as e:
1035
+ MediLink_ConfigLoader.log("Error converting default Medisoft ID '{}' to integer for Payer ID '{}': {}".format(medisoft_ids[0], ins1_payer_id, e), level="ERROR")
1036
+ row['Ins1 Insurance ID'] = None
1037
+
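For reference, the flattened cache built at the top of NEW_update_insurance_ids has this shape (payer IDs, Medisoft IDs, and the endpoint are invented):

    payer_cache = {
        '87726': {'medisoft_id': ['14'], 'medisoft_medicare_id': [], 'endpoint': 'OPTUMEDI'},
        '00430': {'medisoft_id': [], 'medisoft_medicare_id': [], 'endpoint': None},  # placeholder
    }
    # One flat lookup replaces the nested crosswalk['payer_id'][pid].get(...) chain:
    print(payer_cache.get('87726', {}).get('medisoft_id', []))  # ['14']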
1038
+ def update_insurance_ids(csv_data, config, crosswalk):
1039
+ # LOGGING STRATEGY: Remove DEBUG level function start log - DEBUG is typically silent anyway
1040
+ # MediLink_ConfigLoader.log("Starting update_insurance_ids function.", level="DEBUG")
1041
+
1042
+ # TIMING: Start insurance ID updates with granular tracking
1043
+ total_start_time = time.time()
1044
+ lookup_build_time = 0
1045
+ csv_processing_time = 0
1046
+ processed_count = 0
1047
+ medicare_count = 0
1048
+ regular_count = 0
1049
+ placeholder_count = 0
1050
+
1051
+ print("Starting insurance ID updates for {} rows...".format(len(csv_data)))
1052
+ # LOGGING STRATEGY: Only log start/end of looped events at INFO level, not individual successes
1053
+ # MediLink_ConfigLoader.log("Starting insurance ID updates for {} rows...".format(len(csv_data)), level="INFO") # REMOVED
1054
+
1055
+ # TIMING: Start lookup dictionary building
1056
+ lookup_start_time = time.time()
1057
+
1058
+ # PERFORMANCE FIX: Pre-build optimized lookup dictionaries for both regular and Medicare IDs
1059
+ # This reduces Medicare processing overhead by building lookups once instead of repeated processing
1060
+ payer_id_to_medisoft = {}
1061
+ payer_id_to_medicare = {}
1062
+ # LOGGING STRATEGY: Remove DEBUG level initialization log - DEBUG is typically silent anyway
1063
+ # MediLink_ConfigLoader.log("Initialized optimized lookup dictionaries for Medicare and regular IDs.", level="DEBUG")
1064
+
1065
+ # Build both lookup dictionaries simultaneously to avoid multiple iterations
1066
+ for payer_id, details in crosswalk.get('payer_id', {}).items():
1067
+ # Get both regular and Medicare IDs
1068
+ medisoft_ids = details.get('medisoft_id', [])
1069
+ medicare_ids = details.get('medisoft_medicare_id', [])
1070
+
1071
+ # Filter empty strings once for each type
1072
+ medisoft_ids = [id for id in medisoft_ids if id] if medisoft_ids else []
1073
+ medicare_ids = [id for id in medicare_ids if id] if medicare_ids else []
1074
+
1075
+ # Store first valid ID for quick lookup (Medicare takes precedence if available)
1076
+ payer_id_to_medisoft[payer_id] = int(medisoft_ids[0]) if medisoft_ids else None
1077
+ payer_id_to_medicare[payer_id] = int(medicare_ids[0]) if medicare_ids else None
1078
+
1079
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1080
+ # if len(payer_id_to_medisoft) <= 10 or len(payer_id_to_medisoft) % 50 == 0: # Log first 10 and every 50th
1081
+ # MediLink_ConfigLoader.log("Processed Payer ID '{}': Regular IDs: {}, Medicare IDs: {}".format(
1082
+ # payer_id, medisoft_ids, medicare_ids), level="DEBUG")
1083
+
1084
+ # TIMING: End lookup dictionary building
1085
+ lookup_end_time = time.time()
1086
+ lookup_build_time = lookup_end_time - lookup_start_time
1087
+
1088
+ if PERFORMANCE_LOGGING:
1089
+ print("Built lookup dictionaries in {:.2f} seconds for {} payer IDs".format(lookup_build_time, len(payer_id_to_medisoft)))
1090
+
1091
+
1092
+ # TIMING: Start CSV processing
1093
+ csv_start_time = time.time()
1094
+
1095
+ # PERFORMANCE FIX: Single pass through CSV data with optimized Medicare ID resolution
1096
+ for row_idx, row in enumerate(csv_data, 1):
1097
+ ins1_payer_id = row.get('Ins1 Payer ID', '').strip()
1098
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1099
+ # if row_idx <= 10 or row_idx % 100 == 0: # Log first 10 and every 100th
1100
+ # MediLink_ConfigLoader.log("Processing row #{} with Ins1 Payer ID '{}'.".format(row_idx, ins1_payer_id), level="DEBUG")
1101
+
1102
+ # Try Medicare ID first, then fall back to regular ID (optimized Medicare processing)
1103
+ insurance_id = (payer_id_to_medicare.get(ins1_payer_id) or
1104
+ payer_id_to_medisoft.get(ins1_payer_id))
1105
+
1106
+ if insurance_id is None and ins1_payer_id not in payer_id_to_medisoft:
1107
+ # Add placeholder entry for new payer ID (preserve original functionality)
1108
+ payer_id_to_medisoft[ins1_payer_id] = None
1109
+ payer_id_to_medicare[ins1_payer_id] = None
1110
+ crosswalk.setdefault('payer_id', {})[ins1_payer_id] = {
1111
+ 'medisoft_id': [], # Placeholder for future Medisoft IDs
1112
+ 'medisoft_medicare_id': [], # Placeholder for future Medicare IDs
1113
+ 'endpoint': None # Placeholder for future endpoint
1114
+ }
1115
+ placeholder_count += 1
1116
+ # LOGGING STRATEGY: Log actual events (new payer IDs) at INFO level
1117
+ if placeholder_count <= 5: # Only log first 5 placeholders
1118
+ MediLink_ConfigLoader.log("Added placeholder entry for new Payer ID '{}'.".format(ins1_payer_id), level="INFO")
1119
+ elif insurance_id is not None and insurance_id == payer_id_to_medicare.get(ins1_payer_id):
1120
+ medicare_count += 1
1121
+ else:
1122
+ regular_count += 1
1123
+
1124
+ # Assign the resolved insurance ID to the row
1125
+ row['Ins1 Insurance ID'] = insurance_id
1126
+ # TODO (SECONDARY QUEUE): When building a secondary-claims queue after Medicare crossover,
1127
+ # set claim_type='secondary' and attach prior payer fields here from the Medicare primary outcome:
1128
+ # - row['prior_payer_name'] = 'MEDICARE'
1129
+ # - row['prior_payer_id'] = best Medicare ID from config/crosswalk
1130
+ # - optionally row['primary_paid_amount'], row['cas_adjustments'] extracted from 835
1131
+ processed_count += 1
1132
+ # LOGGING STRATEGY: Remove success logging - DEBUG is typically silent anyway
1133
+ # if processed_count <= 10 or processed_count % 100 == 0: # Log first 10 and every 100th
1134
+ # MediLink_ConfigLoader.log("Assigned Insurance ID '{}' to row with Ins1 Payer ID '{}'.".format(insurance_id, ins1_payer_id), level="DEBUG")
1135
+
1136
+ # TIMING: End CSV processing
1137
+ csv_end_time = time.time()
1138
+ csv_processing_time = csv_end_time - csv_start_time
1139
+
1140
+ # TIMING: End total insurance ID updates
1141
+ total_end_time = time.time()
1142
+ total_duration = total_end_time - total_start_time
1143
+
1144
+ if PERFORMANCE_LOGGING:
1145
+ print("Insurance ID updates completed:")
1146
+ print(" - Total duration: {:.2f} seconds".format(total_duration))
1147
+ print(" - Lookup building time: {:.2f} seconds ({:.1f}%)".format(lookup_build_time, (lookup_build_time/total_duration)*100))
1148
+ print(" - CSV processing time: {:.2f} seconds ({:.1f}%)".format(csv_processing_time, (csv_processing_time/total_duration)*100))
1149
+ print(" - Processed: {} rows, Medicare: {} rows, Regular: {} rows, Placeholders: {} rows".format(
1150
+ processed_count, medicare_count, regular_count, placeholder_count))
1151
+
1152
+ # LOGGING STRATEGY: Log completion summary at INFO level (end of looped event)
1153
+ MediLink_ConfigLoader.log("Insurance ID updates completed - Total: {:.2f}s, Lookup: {:.2f}s, Processing: {:.2f}s, Processed: {}, Medicare: {}, Regular: {}, Placeholders: {}".format(
1154
+ total_duration, lookup_build_time, csv_processing_time, processed_count, medicare_count, regular_count, placeholder_count), level="INFO")
1155
+
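The Medicare-precedence resolution reduces to a single `or` between the two lookup tables; a self-contained sketch with invented IDs (note this relies on real IDs being truthy integers):

    payer_id_to_medicare = {'00123': 77, '00456': None}
    payer_id_to_medisoft = {'00123': 14, '00456': 15}

    for pid in ('00123', '00456', '99999'):
        print(pid, payer_id_to_medicare.get(pid) or payer_id_to_medisoft.get(pid))
    # 00123 77    (Medicare ID wins when present)
    # 00456 15    (falls back to the regular ID)
    # 99999 None  (unknown payer -> placeholder path)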
1156
+ def update_procedure_codes(csv_data, crosswalk):
1157
+
1158
+ # Get Medisoft shorthand dictionary from crosswalk and reverse it
1159
+ diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {}) # BUG: Be careful here if the crosswalk data changes, specifically with regard to the T8/H prefix handling.
1160
+ medisoft_to_diagnosis = {v: k for k, v in diagnosis_to_medisoft.items()}
1161
+
1162
+ # Get procedure code to diagnosis dictionary from crosswalk and reverse it for easier lookup
1163
+ diagnosis_to_procedure = {
1164
+ diagnosis_code: procedure_code
1165
+ for procedure_code, diagnosis_codes in crosswalk.get('procedure_to_diagnosis', {}).items()
1166
+ for diagnosis_code in diagnosis_codes
1167
+ }
1168
+
1169
+ # Initialize counters for tracking
1170
+ updated_count = 0
1171
+ missing_medisoft_codes = set()
1172
+ missing_procedure_mappings = set()
1173
+
1174
+ # Update the "Procedure Code" column in the CSV data
1175
+ for row_num, row in enumerate(csv_data, start=1):
1176
+ try:
1177
+ medisoft_code = row.get('Default Diagnosis #1', '').strip()
1178
+ diagnosis_code = medisoft_to_diagnosis.get(medisoft_code)
1179
+
1180
+ if diagnosis_code:
1181
+ procedure_code = diagnosis_to_procedure.get(diagnosis_code)
1182
+ if procedure_code:
1183
+ row['Procedure Code'] = procedure_code
1184
+ updated_count += 1
1185
+ else:
1186
+ # Track missing procedure mapping
1187
+ missing_procedure_mappings.add(diagnosis_code)
1188
+ row['Procedure Code'] = "Unknown" # Will be handled by 837p encoder
1189
+ MediLink_ConfigLoader.log("Missing procedure mapping for diagnosis code '{}' (Medisoft code: '{}') in row {}".format(
1190
+ diagnosis_code, medisoft_code, row_num), level="WARNING")
1191
+ else:
1192
+ # Track missing Medisoft code mapping
1193
+ if medisoft_code: # Only track if there's actually a code
1194
+ missing_medisoft_codes.add(medisoft_code)
1195
+ row['Procedure Code'] = "Unknown" # Will be handled by 837p encoder
1196
+ MediLink_ConfigLoader.log("Missing Medisoft code mapping for '{}' in row {}".format(
1197
+ medisoft_code, row_num), level="WARNING")
1198
+ except Exception as e:
1199
+ MediLink_ConfigLoader.log("In update_procedure_codes, Error processing row {}: {}".format(row_num, e), level="ERROR")
1200
+
1201
+ # Log summary statistics
1202
+ MediLink_ConfigLoader.log("Total {} 'Procedure Code' rows updated.".format(updated_count), level="INFO")
1203
+
1204
+ if missing_medisoft_codes:
1205
+ MediLink_ConfigLoader.log("Missing Medisoft code mappings: {}".format(sorted(missing_medisoft_codes)), level="WARNING")
1206
+ print("WARNING: {} Medisoft codes need to be added to diagnosis_to_medisoft mapping: {}".format(
1207
+ len(missing_medisoft_codes), sorted(missing_medisoft_codes)))
1208
+
1209
+ if missing_procedure_mappings:
1210
+ MediLink_ConfigLoader.log("Missing procedure mappings for diagnosis codes: {}".format(sorted(missing_procedure_mappings)), level="WARNING")
1211
+ print("WARNING: {} diagnosis codes need to be added to procedure_to_diagnosis mapping: {}".format(
1212
+ len(missing_procedure_mappings), sorted(missing_procedure_mappings)))
1213
+
1214
+ return True
1215
+
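The two inversions at the top of update_procedure_codes can be sanity-checked in isolation; the codes below are illustrative only:

    crosswalk = {
        'diagnosis_to_medisoft': {'H25.11': '2511'},
        'procedure_to_diagnosis': {'66984': ['H25.11', 'H25.12']},
    }
    medisoft_to_diagnosis = {v: k for k, v in crosswalk['diagnosis_to_medisoft'].items()}
    diagnosis_to_procedure = {dx: cpt
                              for cpt, dx_list in crosswalk['procedure_to_diagnosis'].items()
                              for dx in dx_list}
    print(medisoft_to_diagnosis['2511'])     # H25.11
    print(diagnosis_to_procedure['H25.12'])  # 66984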
1216
+ def update_diagnosis_codes(csv_data):
1217
+ try:
1218
+ # TIMING: Start surgery schedule parsing timing
1219
+ parsing_start_time = time.time()
1220
+ print("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")))
1221
+ MediLink_ConfigLoader.log("Starting surgery schedule parsing at: {}".format(time.strftime("%H:%M:%S")), level="INFO")
1222
+
1223
+ # Use cached configuration instead of loading repeatedly
1224
+ config, crosswalk = get_cached_configuration()
1225
+
1226
+ # Extract the local storage path from the configuration
1227
+ local_storage_path = config['MediLink_Config']['local_storage_path']
1228
+
1229
+ # Initialize a dictionary to hold diagnosis codes from all DOCX files
1230
+ all_patient_data = {}
1231
+ all_schedule_positions = {} # NEW: Store schedule positions for future sorting
1232
+
1233
+ # Convert surgery dates in CSV data
1234
+ convert_surgery_date(csv_data)
1235
+
1236
+ # Extract all valid surgery dates from csv_data
1237
+ surgery_dates = [row['Surgery Date'] for row in csv_data if row['Surgery Date'] != datetime.min]
1238
+
1239
+ if not surgery_dates:
1240
+ raise ValueError("No valid surgery dates found in csv_data.")
1241
+
1242
+ # Determine the minimum and maximum surgery dates
1243
+ min_surgery_date = min(surgery_dates)
1244
+ max_surgery_date = max(surgery_dates)
1245
+
1246
+ # Apply a +/-8-day margin to the surgery dates... Increased from 5 days.
1247
+ margin = timedelta(days=8)
1248
+ threshold_start = min_surgery_date - margin
1249
+ threshold_end = max_surgery_date + margin
1250
+
1251
+ # TODO (Low) This mtime filter is fragile: if downloading the DOCX files takes 'too long', they are
1252
+ # presumed out of range, because the file modification date is a poor proxy for the surgery date
1253
+ # contained inside the DOCX file. Extracting the surgery date from the DOCX itself is non-trivial
1254
+ # and computationally expensive, so a smarter windowing approach is needed.
1255
+
1256
+ MediLink_ConfigLoader.log("BAD IDEA: Processing DOCX files modified between {} and {}.".format(threshold_start, threshold_end), level="INFO")
1257
+
1258
+ # TIMING: Start file system operations
1259
+ filesystem_start_time = time.time()
1260
+
1261
+ # PERFORMANCE OPTIMIZATION: Batch file system operations with caching
1262
+ # Pre-convert threshold timestamps for efficient comparison (Windows XP compatible)
1263
+ threshold_start_ts = threshold_start.timestamp() if hasattr(threshold_start, 'timestamp') else time.mktime(threshold_start.timetuple())
1264
+ threshold_end_ts = threshold_end.timestamp() if hasattr(threshold_end, 'timestamp') else time.mktime(threshold_end.timetuple())
1265
+
1266
+ # Lightweight on-disk index to avoid relying on mtime for clinical windowing
1267
+ # Index format: { filename: { 'mtime': float, 'dates': ['MM-DD-YYYY', ...] } }
1268
+ index_path = os.path.join(local_storage_path, '.docx_index.json')
1269
+ docx_index = {}
1270
+ try:
1271
+ if os.path.exists(index_path):
1272
+ import json
1273
+ with open(index_path, 'r') as jf:
1274
+ docx_index = json.load(jf)
1275
+ except Exception:
1276
+ docx_index = {}
1277
+
1278
+ valid_files = []
1279
+ try:
1280
+ # Use os.listdir() with optimized timestamp comparison (XP/3.4.4 compatible)
1281
+ for filename in os.listdir(local_storage_path):
1282
+ if filename.endswith('.docx'):
1283
+ filepath = os.path.join(local_storage_path, filename)
1284
+ # Get file modification time in single operation
1285
+ try:
1286
+ stat_info = os.stat(filepath)
1287
+ # Direct timestamp comparison avoids datetime conversion overhead
1288
+ # First filter by mtime for performance
1289
+ if not (threshold_start_ts <= stat_info.st_mtime <= threshold_end_ts):
1290
+ continue
1291
+ # If indexed and mtime unchanged, prefer date-based decision from index
1292
+ rec = docx_index.get(filename)
1293
+ if rec and isinstance(rec, dict) and abs(rec.get('mtime', 0) - stat_info.st_mtime) < 0.001:
1294
+ # If any extracted date falls within threshold window, keep file
1295
+ dates = rec.get('dates', []) or []
1296
+ keep = False
1297
+ for d in dates:
1298
+ try:
1299
+ # parse 'MM-DD-YYYY' to timestamp
1300
+ dt = datetime.strptime(d, '%m-%d-%Y')
1301
+ ts = dt.timestamp() if hasattr(dt, 'timestamp') else time.mktime(dt.timetuple())
1302
+ if threshold_start_ts <= ts <= threshold_end_ts:
1303
+ keep = True
1304
+ break
1305
+ except Exception:
1306
+ continue
1307
+ if not keep:
1308
+ continue
1309
+ # mtime passes or index indicates date window match
1310
+ valid_files.append(filepath)
1311
+ except (OSError, ValueError):
1312
+ # Skip files with invalid modification times
1313
+ continue
1314
+ except OSError:
1315
+ MediLink_ConfigLoader.log("Error accessing directory: {}".format(local_storage_path), level="ERROR")
1316
+ return
1317
+
1318
+ # TIMING: End file system operations
1319
+ filesystem_end_time = time.time()
1320
+ filesystem_duration = filesystem_end_time - filesystem_start_time
1321
+
1322
+ # PERFORMANCE OPTIMIZATION: Log file count for debugging without processing overhead
1323
+ MediLink_ConfigLoader.log("Found {} DOCX files within date threshold".format(len(valid_files)), level="INFO")
1324
+
1325
+ # TIMING: Start CSV data preprocessing
1326
+ csv_prep_start_time = time.time()
1327
+
1328
+ # PERFORMANCE OPTIMIZATION: Pre-process patient IDs for efficient lookup
1329
+ # Create a set of patient IDs from CSV data for faster lookups
1330
+ patient_ids_in_csv = {row.get('Patient ID', '').strip() for row in csv_data}
1331
+
1332
+ # PERFORMANCE OPTIMIZATION: Pre-convert surgery dates to string format
1333
+ # Convert all surgery dates to string format once to avoid repeated conversions in loops
1334
+ surgery_date_strings = {}
1335
+ for row in csv_data:
1336
+ patient_id = row.get('Patient ID', '').strip()
1337
+ surgery_date = row.get('Surgery Date')
1338
+ if surgery_date != datetime.min:
1339
+ surgery_date_strings[patient_id] = surgery_date.strftime("%m-%d-%Y")
1340
+ else:
1341
+ surgery_date_strings[patient_id] = ''
1342
+
1343
+ # TIMING: End CSV data preprocessing
1344
+ csv_prep_end_time = time.time()
1345
+ csv_prep_duration = csv_prep_end_time - csv_prep_start_time
1346
+
1347
+ # TIMING: Log before processing DOCX files
1348
+ docx_processing_start_time = time.time()
1349
+ print("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)))
1350
+ MediLink_ConfigLoader.log("Found {} DOCX files to process. Starting DOCX parsing...".format(len(valid_files)), level="INFO")
1351
+
1352
+ # TIMING: Track individual DOCX file processing
1353
+ docx_files_processed = 0
1354
+ docx_files_skipped = 0
1355
+ docx_parse_errors = 0
1356
+
1357
+ # Process valid DOCX files
1358
+ updated_index = False
1359
+ for filepath in valid_files:
1360
+ # TIMING: Start individual file processing
1361
+ file_start_time = time.time()
1362
+
1363
+ try:
1364
+ if SORTING_STRATEGY == 'schedule_based':
1365
+ # Enhanced parsing to capture schedule positions
1366
+ patient_data, schedule_positions = parse_docx(filepath, surgery_dates, capture_schedule_positions=True) # Pass surgery_dates to parse_docx
1367
+ # Store schedule positions for future sorting
1368
+ for patient_id, dates in schedule_positions.items():
1369
+ if patient_id not in all_schedule_positions:
1370
+ all_schedule_positions[patient_id] = {}
1371
+ all_schedule_positions[patient_id].update(dates)
1372
+ else:
1373
+ # Standard parsing (maintains backward compatibility)
1374
+ patient_data = parse_docx(filepath, surgery_dates, capture_schedule_positions=False) # Pass surgery_dates to parse_docx
1375
+
1376
+ docx_files_processed += 1
1377
+
1378
+ # PERFORMANCE OPTIMIZATION: Use defaultdict for more efficient dictionary operations
1379
+ for patient_id, service_dates in patient_data.items():
1380
+ if patient_id not in all_patient_data:
1381
+ all_patient_data[patient_id] = {}
1382
+ for date_of_service, diagnosis_data in service_dates.items():
1383
+ # TODO: SURGERY SCHEDULE CONFLICT RESOLUTION
1384
+ # Implement enhanced conflict detection and logging as outlined in
1385
+ # surgery_schedule_conflict_resolution_strategy.md
1386
+ #
1387
+ # Current behavior: Silent overwriting with latest file wins
1388
+ # Proposed enhancement:
1389
+ # 1. Detect when multiple files contain data for same date
1390
+ # 2. Log conflicts with date-organized notifications showing:
1391
+ # - Source files (with modification timestamps)
1392
+ # - Patients affected (added/removed/modified)
1393
+ # - Specific changes (diagnosis, laterality, etc.)
1394
+ # 3. Use file modification time to determine priority
1395
+ # 4. Generate summary report organized by surgery date
1396
+ #
1397
+ # Example notification format:
1398
+ # "SURGERY SCHEDULE CONFLICTS DETECTED FOR: 12/15/2023"
1399
+ # " Original: file1.docx (modified: 08:30:00)"
1400
+ # " Revised: file2.docx (modified: 14:45:00)"
1401
+ # " Patients affected: 3 modified, 1 added, 1 removed"
1402
+ # " Resolution: Using latest file (file2.docx)"
1403
+ #
1404
+ # This will provide transparency when revised schedules overwrite
1405
+ # original schedules, organized by the affected surgery dates.
1406
+ all_patient_data[patient_id][date_of_service] = diagnosis_data
1407
+ # Update index entry for this file (store union of extracted surgery dates)
1408
+ try:
1409
+ dates_list = []
1410
+ for _pid, _dates in patient_data.items():
1411
+ for _dos in _dates.keys():
1412
+ dates_list.append(_dos)
1413
+ if dates_list:
1414
+ filename = os.path.basename(filepath)
1415
+ stat_info = os.stat(filepath)
1416
+ docx_index[filename] = {
1417
+ 'mtime': stat_info.st_mtime,
1418
+ 'dates': sorted(list(set(dates_list)))
1419
+ }
1420
+ updated_index = True
1421
+ except Exception:
1422
+ pass
1423
+ except Exception as e:
1424
+ docx_parse_errors += 1
1425
+ MediLink_ConfigLoader.log("Error parsing DOCX file {}: {}".format(filepath, e), level="ERROR")
1426
+
1427
+ # TIMING: End individual file processing
1428
+ file_end_time = time.time()
1429
+ file_duration = file_end_time - file_start_time
1430
+
1431
+ # Log slow files (taking more than 1 second)
1432
+ if file_duration > 1.0 and PERFORMANCE_LOGGING:
1433
+ print(" - Slow file: {} (Duration: {:.2f} seconds)".format(os.path.basename(filepath), file_duration))
1434
+
1435
+ # Write index back if updated
1436
+ if updated_index:
1437
+ try:
1438
+ import json
1439
+ with open(index_path, 'w') as jf:
1440
+ json.dump(docx_index, jf)
1441
+ except Exception:
1442
+ pass
1443
+
1444
+ # TIMING: Log DOCX processing completion
1445
+ docx_processing_end_time = time.time()
1446
+ docx_processing_duration = docx_processing_end_time - docx_processing_start_time
1447
+ if PERFORMANCE_LOGGING:
1448
+ print("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1449
+ time.strftime("%H:%M:%S"), docx_processing_duration))
1450
+ print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(
1451
+ docx_files_processed, docx_files_skipped, docx_parse_errors))
1452
+ MediLink_ConfigLoader.log("DOCX parsing completed at: {} (Duration: {:.2f} seconds)".format(
1453
+ time.strftime("%H:%M:%S"), docx_processing_duration), level="INFO")
1454
+
1455
+ # Log if no valid files were found
1456
+ if not valid_files:
1457
+ MediLink_ConfigLoader.log("No valid DOCX files found within the modification time threshold.", level="INFO")
1458
+
1459
+ # Debug logging for all_patient_data
1460
+ MediLink_ConfigLoader.log("All patient data collected from DOCX files: {}".format(all_patient_data), level="DEBUG")
1461
+
1462
+ # Check if any patient data was collected
1463
+ if not all_patient_data or not patient_ids_in_csv.intersection(all_patient_data.keys()):
1464
+ MediLink_ConfigLoader.log("No patient data collected or no matching Patient IDs found. Skipping further processing.", level="INFO")
1465
+ return # Exit the function early if no data is available
1466
+
1467
+ # TIMING: Start CSV data matching
1468
+ csv_matching_start_time = time.time()
1469
+
1470
+ # Get Medisoft shorthand dictionary from crosswalk.
1471
+ diagnosis_to_medisoft = crosswalk.get('diagnosis_to_medisoft', {})
1472
+
1473
+ # Initialize counter for updated rows
1474
+ updated_count = 0
1475
+
1476
+ # PERFORMANCE OPTIMIZATION: Single pass through CSV data with pre-processed lookups
1477
+ # Update the "Default Diagnosis #1" column in the CSV data and store diagnosis codes for all surgery dates
1478
+ for row_num, row in enumerate(csv_data, start=1):
1479
+ patient_id = row.get('Patient ID', '').strip()
1480
+ # Guard via the pre-processed patient ID set (always true by construction here, since the set was built from this same csv_data)
1481
+ if patient_id not in patient_ids_in_csv:
1482
+ continue # Skip rows that do not match any patient ID
1483
+
1484
+ MediLink_ConfigLoader.log("Processing row number {}.".format(row_num), level="DEBUG")
1485
+
1486
+ # Get all surgery dates for this patient
1487
+ all_surgery_dates = row.get('_all_surgery_dates', [row.get('Surgery Date')])
1488
+
1489
+ # Create a mapping of surgery dates to diagnosis codes for this patient
1490
+ surgery_date_to_diagnosis = {}
1491
+
1492
+ if patient_id in all_patient_data:
1493
+ # Process each surgery date for this patient
1494
+ for surgery_date in all_surgery_dates:
1495
+ # Convert surgery date to string format for lookup
1496
+ try:
1497
+ if hasattr(surgery_date, 'strftime'):
1498
+ surgery_date_str = surgery_date.strftime('%m-%d-%Y')
1499
+ else:
1500
+ surgery_date_str = str(surgery_date)
1501
+ except Exception:
1502
+ surgery_date_str = str(surgery_date)
1503
+
1504
+ MediLink_ConfigLoader.log("Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1505
+
1506
+ if surgery_date_str in all_patient_data[patient_id]:
1507
+ diagnosis_data = all_patient_data[patient_id][surgery_date_str]
1508
+ # XP SP3 + Py3.4.4 compatible tuple unpacking with safety check
1509
+ try:
1510
+ if isinstance(diagnosis_data, (list, tuple)) and len(diagnosis_data) >= 3:
1511
+ diagnosis_code, left_or_right_eye, femto_yes_or_no = diagnosis_data
1512
+ else:
1513
+ # Handle case where diagnosis_data is not a proper tuple
1514
+ diagnosis_code = diagnosis_data if diagnosis_data else None
1515
+ left_or_right_eye = None
1516
+ femto_yes_or_no = None
1517
+ except Exception as e:
1518
+ MediLink_ConfigLoader.log("Error unpacking diagnosis data for Patient ID: {}, Surgery Date: {}: {}".format(
1519
+ patient_id, surgery_date_str, str(e)), level="WARNING")
1520
+ diagnosis_code = None
1521
+ left_or_right_eye = None
1522
+ femto_yes_or_no = None
1523
+
1524
+ MediLink_ConfigLoader.log("Found diagnosis data for Patient ID: {}, Surgery Date: {}".format(patient_id, surgery_date_str), level="DEBUG")
1525
+
1526
+ # Convert diagnosis code to Medisoft shorthand format.
1527
+ # XP SP3 + Py3.4.4 compatible null check
1528
+ if diagnosis_code is None:
1529
+ medisoft_shorthand = 'N/A'
1530
+ MediLink_ConfigLoader.log("Diagnosis code is None for Patient ID: {}, Surgery Date: {}".format(
1531
+ patient_id, surgery_date_str), level="WARNING")
1532
+ else:
1533
+ medisoft_shorthand = diagnosis_to_medisoft.get(diagnosis_code, None)
1534
+ if medisoft_shorthand is None and diagnosis_code:
1535
+ # Use fallback logic for missing mapping (XP SP3 + Py3.4.4 compatible)
1536
+ try:
1537
+ defaulted_code = diagnosis_code.lstrip('H').lstrip('T8').replace('.', '')[-5:]  # NOTE: lstrip() strips character sets ('T' and '8'), not literal prefixes
1538
+ # Basic validation: ensure code is not empty and has reasonable length
1539
+ if defaulted_code and len(defaulted_code) >= 3:
1540
+ medisoft_shorthand = defaulted_code
1541
+ MediLink_ConfigLoader.log("Missing diagnosis mapping for '{}', using fallback code '{}'".format(
1542
+ diagnosis_code, medisoft_shorthand), level="WARNING")
1543
+ else:
1544
+ medisoft_shorthand = 'N/A'
1545
+ MediLink_ConfigLoader.log("Fallback diagnosis code validation failed for '{}', using 'N/A'".format(
1546
+ diagnosis_code), level="WARNING")
1547
+ except Exception as e:
1548
+ medisoft_shorthand = 'N/A'
1549
+ MediLink_ConfigLoader.log("Error in fallback diagnosis code generation for '{}': {}".format(
1550
+ diagnosis_code, str(e)), level="WARNING")
1551
+
1552
+ MediLink_ConfigLoader.log("Converted diagnosis code to Medisoft shorthand: {}".format(medisoft_shorthand), level="DEBUG")
1553
+
1554
+ surgery_date_to_diagnosis[surgery_date_str] = medisoft_shorthand
1555
+ else:
1556
+ MediLink_ConfigLoader.log("No matching surgery date found for Patient ID: {} on date {}.".format(patient_id, surgery_date_str), level="INFO")
1557
+ surgery_date_to_diagnosis[surgery_date_str] = 'N/A'
1558
+
1559
+ # Store the diagnosis mapping for all surgery dates
1560
+ row['_surgery_date_to_diagnosis'] = surgery_date_to_diagnosis
1561
+
1562
+ # NEW: Store schedule positions for future sorting if available
1563
+ if SORTING_STRATEGY == 'schedule_based' and patient_id in all_schedule_positions:
1564
+ row['_schedule_positions'] = all_schedule_positions[patient_id]
1565
+
1566
+ # Set the primary diagnosis code (for the main surgery date)
1567
+ primary_surgery_date = row.get('Surgery Date')
1568
+ # Convert primary surgery date to string for lookup
1569
+ if isinstance(primary_surgery_date, datetime):
1570
+ primary_surgery_date_str = primary_surgery_date.strftime('%m-%d-%Y')
1571
+ else:
1572
+ primary_surgery_date_str = str(primary_surgery_date)
1573
+ primary_diagnosis = surgery_date_to_diagnosis.get(primary_surgery_date_str, 'N/A')
1574
+ row['Default Diagnosis #1'] = primary_diagnosis
1575
+
1576
+ updated_count += 1
1577
+ MediLink_ConfigLoader.log("Updated row number {} with diagnosis codes for {} surgery dates.".format(row_num, len(all_surgery_dates)), level="INFO")
1578
+ else:
1579
+ MediLink_ConfigLoader.log("Patient ID: {} not found in DOCX data for row {}.".format(patient_id, row_num), level="INFO")
1580
+
1581
+ # TIMING: End CSV data matching
1582
+ csv_matching_end_time = time.time()
1583
+ csv_matching_duration = csv_matching_end_time - csv_matching_start_time
1584
+
1585
+ # Log total count of updated rows
1586
+ MediLink_ConfigLoader.log("Total {} 'Default Diagnosis #1' rows updated.".format(updated_count), level="INFO")
1587
+
1588
+ # TIMING: End surgery schedule parsing timing
1589
+ parsing_end_time = time.time()
1590
+ parsing_duration = parsing_end_time - parsing_start_time
1591
+ if PERFORMANCE_LOGGING:
1592
+ print("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1593
+ time.strftime("%H:%M:%S"), parsing_duration))
1594
+ print(" - File system operations: {:.2f} seconds ({:.1f}%)".format(filesystem_duration, (filesystem_duration/parsing_duration)*100))
1595
+ print(" - CSV data preprocessing: {:.2f} seconds ({:.1f}%)".format(csv_prep_duration, (csv_prep_duration/parsing_duration)*100))
1596
+ print(" - DOCX file processing: {:.2f} seconds ({:.1f}%)".format(docx_processing_duration, (docx_processing_duration/parsing_duration)*100))
1597
+ print(" - CSV data matching: {:.2f} seconds ({:.1f}%)".format(csv_matching_duration, (csv_matching_duration/parsing_duration)*100))
1598
+ print(" - Files processed: {}, Files skipped: {}, Parse errors: {}".format(docx_files_processed, docx_files_skipped, docx_parse_errors))
1599
+ MediLink_ConfigLoader.log("Surgery schedule parsing completed at: {} (Duration: {:.2f} seconds)".format(
1600
+ time.strftime("%H:%M:%S"), parsing_duration), level="INFO")
1601
+
1602
+ except Exception as e:
1603
+ message = "An error occurred while updating diagnosis codes. Please check the DOCX files and configuration: {}".format(e)
1604
+ MediLink_ConfigLoader.log(message, level="ERROR")
1605
+ print(message)
1606
+
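A round-trip sketch of the on-disk DOCX index described above; the filename, mtime, and dates are invented, but the record shape matches the documented format:

    import json, os

    docx_index = {
        'schedule_sample.docx': {
            'mtime': 1702654321.0,
            'dates': ['12-15-2023', '12-18-2023'],
        }
    }
    index_path = os.path.join('.', '.docx_index.json')
    with open(index_path, 'w') as jf:
        json.dump(docx_index, jf)
    with open(index_path, 'r') as jf:
        print(json.load(jf)['schedule_sample.docx']['dates'])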
1607
+ def load_data_sources(config, crosswalk):
1608
+ """Loads historical mappings from MAPAT and Carol's CSVs."""
1609
+ patient_id_to_insurance_id = load_insurance_data_from_mapat(config, crosswalk)
1610
+ if not patient_id_to_insurance_id:
1611
+ raise ValueError("Failed to load historical Patient ID to Insurance ID mappings from MAPAT.")
1612
+
1613
+ payer_id_to_patient_ids = load_historical_payer_to_patient_mappings(config)
1614
+ if not payer_id_to_patient_ids:
1615
+ raise ValueError("Failed to load historical Carol's CSVs.")
1616
+
1617
+ return patient_id_to_insurance_id, payer_id_to_patient_ids
1618
+
1619
+ def map_payer_ids_to_insurance_ids(patient_id_to_insurance_id, payer_id_to_patient_ids):
1620
+ """Maps Payer IDs to Insurance IDs based on the historical mappings."""
1621
+ payer_id_to_details = {}
1622
+ for payer_id, patient_ids in payer_id_to_patient_ids.items():
1623
+ medisoft_ids = set()
1624
+ for patient_id in patient_ids:
1625
+ if patient_id in patient_id_to_insurance_id:
1626
+ medisoft_id = patient_id_to_insurance_id[patient_id]
1627
+ medisoft_ids.add(medisoft_id)
1628
+ MediLink_ConfigLoader.log("Added Medisoft ID {} for Patient ID {} and Payer ID {}".format(medisoft_id, patient_id, payer_id))
1629
+ else:
1630
+ MediLink_ConfigLoader.log("No matching Insurance ID found for Patient ID {}".format(patient_id))
1631
+ if medisoft_ids:
1632
+ # Read default endpoint from cached configuration (maintains existing OPTUMEDI behavior)
1633
+ try:
1634
+ cfg, _cw = get_cached_configuration()
1635
+ except Exception:
1636
+ cfg = {}
1637
+ default_ep = _get_default_endpoint(cfg)
1638
+ payer_id_to_details[payer_id] = {
1639
+ "endpoint": default_ep,
1640
+ "medisoft_id": list(medisoft_ids),
1641
+ "medisoft_medicare_id": [] # Placeholder for future implementation
1642
+ }
1643
+ return payer_id_to_details
1644
+
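The core of the mapping above, restated as a self-contained sketch (IDs invented; the real function also resolves the default endpoint from cached configuration):

    patient_id_to_insurance_id = {'PT001': '14', 'PT002': '14', 'PT003': '22'}
    payer_id_to_patient_ids = {'87726': {'PT001', 'PT003'}, '00430': {'PT999'}}

    payer_id_to_details = {}
    for payer_id, patient_ids in payer_id_to_patient_ids.items():
        medisoft_ids = {patient_id_to_insurance_id[p]
                        for p in patient_ids if p in patient_id_to_insurance_id}
        if medisoft_ids:  # payers with no resolvable patients are dropped
            payer_id_to_details[payer_id] = {'endpoint': 'OPTUMEDI',  # assumed default
                                             'medisoft_id': sorted(medisoft_ids),
                                             'medisoft_medicare_id': []}

    print(payer_id_to_details['87726']['medisoft_id'])  # ['14', '22']
    print('00430' in payer_id_to_details)               # False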
1645
+ def _display_mains_file_error(mains_path):
1646
+ """
1647
+ Helper function to display the critical MAINS file error message.
1648
+
1649
+ Args:
1650
+ mains_path (str): The path where the MAINS file was expected to be found.
1651
+ """
1652
+ error_msg = "CRITICAL: MAINS file not found at: {}. This file is required for insurance name to Medisoft ID mapping.".format(mains_path)
1653
+ if hasattr(MediLink_ConfigLoader, 'log'):
1654
+ MediLink_ConfigLoader.log(error_msg, level="CRITICAL")
1655
+ print("\n" + "="*80)
1656
+ print("CRITICAL ERROR: MAINS FILE MISSING")
1657
+ print("="*80)
1658
+ print("\nThe MAINS file is required for the following critical functions:")
1659
+ print("* Mapping insurance company names to Medisoft IDs")
1660
+ print("* Converting insurance names to payer IDs for claim submission")
1661
+ print("* Creating properly formatted 837p claim files")
1662
+ print("\nWithout this file, claim submission will fail because:")
1663
+ print("* Insurance names cannot be converted to payer IDs")
1664
+ print("* 837p claim files cannot be generated")
1665
+ print("* Claims cannot be submitted to insurance companies")
1666
+ print("\nTO FIX THIS:")
1667
+ print("1. Ensure the MAINS file exists at: {}".format(mains_path))
1668
+ print("2. If the file is missing, llamar a Dani")
1669
+ print("3. The file should contain insurance company data from your Medisoft system")
1670
+ print("="*80)
1671
+ time.sleep(3) # 3 second pause to allow user to read critical error message
1672
+
1673
+
1674
+ def load_insurance_data_from_mains(config):
1675
+ """
1676
+ Loads insurance data from MAINS and creates a mapping from insurance names to their respective IDs.
1677
+ This mapping is critical for the crosswalk update process to correctly associate payer IDs with insurance IDs.
1678
+
1679
+ Args:
1680
+ config (dict): Configuration object containing necessary paths and parameters.
1681
+
1682
+ Returns:
1683
+ dict: A dictionary mapping insurance names to insurance IDs.
1684
+ """
1685
+ # Use cached configuration to avoid repeated loading
1686
+ try:
1687
+ config, crosswalk = get_cached_configuration()
1688
+ except Exception as e:
1689
+ print("Warning: Failed to load cached configuration: {}".format(e))
1690
+ # Return empty mapping if configuration loading fails
1691
+ return {}
1692
+
1693
+ # XP Compatibility: Check if MediLink_DataMgmt is available
1694
+ if MediLink_DataMgmt is None:
1695
+ print("Warning: MediLink_DataMgmt not available. Cannot load MAINS data.")
1696
+ return {}
1697
+
1698
+ # Retrieve MAINS path and slicing information from the configuration
1699
+ # TODO (Low) For secondary insurance, this needs to be pulling from the correct MAINS (there are 2)
1700
+ # TODO (Low) Performance: There probably needs to be a dictionary proxy for MAINS that gets updated.
1701
+ # Meh, this just has to be part of the new architecture plan where we make Medisoft a downstream
1702
+ # recipient from the db.
1703
+ # TODO (High) The Medisoft Medicare flag needs to be brought in here.
1704
+ try:
1705
+ mains_path = config.get('MAINS_MED_PATH', '')
1706
+ mains_slices = crosswalk.get('mains_mapping', {}).get('slices', {})
1707
+ except (KeyError, AttributeError) as e:
1708
+ print("Warning: Failed to get MAINS configuration: {}".format(e))
1709
+ return {}
1710
+
1711
+ # Initialize the dictionary to hold the insurance to insurance ID mappings
1712
+ insurance_to_id = {}
1713
+
1714
+ try:
1715
+ # Check if MAINS file exists before attempting to read
1716
+ if not os.path.exists(mains_path):
1717
+ _display_mains_file_error(mains_path)
1718
+ return insurance_to_id
1719
+
1720
+ # XP Compatibility: Check if MediLink_DataMgmt has the required function
1721
+ if not hasattr(MediLink_DataMgmt, 'read_general_fixed_width_data'):
1722
+ print("Warning: MediLink_DataMgmt.read_general_fixed_width_data not available. Cannot load MAINS data.")
1723
+ return insurance_to_id
1724
+
1725
+ # Read data from MAINS using a provided function to handle fixed-width data
1726
+ for record, line_number in MediLink_DataMgmt.read_general_fixed_width_data(mains_path, mains_slices):
1727
+ insurance_name = record['MAINSNAME']
1728
+ # Assuming line_number gives the correct insurance ID without needing adjustment
1729
+ insurance_to_id[insurance_name] = line_number
1730
+
1731
+ if hasattr(MediLink_ConfigLoader, 'log'):
1732
+ MediLink_ConfigLoader.log("Successfully loaded {} insurance records from MAINS".format(len(insurance_to_id)), level="INFO")
1733
+ else:
1734
+ print("Successfully loaded {} insurance records from MAINS".format(len(insurance_to_id)))
1735
+
1736
+ except FileNotFoundError:
1737
+ _display_mains_file_error(mains_path)
1738
+ except Exception as e:
1739
+ error_msg = "Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e))
1740
+ if hasattr(MediLink_ConfigLoader, 'log'):
1741
+ MediLink_ConfigLoader.log(error_msg, level="ERROR")
1742
+ print("Error loading MAINS data: {}. Continuing without MAINS data.".format(str(e)))
1743
+
1744
+ return insurance_to_id
1745
+
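The slice spec consumed by read_general_fixed_width_data is defined in MediLink_DataMgmt and the crosswalk's mains_mapping; purely to illustrate fixed-width slicing, a hypothetical (start, end) spec could be applied like this:

    mains_slices = {'MAINSNAME': (0, 30)}  # hypothetical field -> (start, end) offsets
    line = 'BLUE CROSS BLUE SHIELD        X999'
    record = {field: line[s:e].strip() for field, (s, e) in mains_slices.items()}
    print(record['MAINSNAME'])  # BLUE CROSS BLUE SHIELD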
1746
+ def load_insurance_data_from_mapat(config, crosswalk):
1747
+ """
1748
+ Loads insurance data from MAPAT and creates a mapping from patient ID to insurance ID.
1749
+
1750
+ Args:
1751
+ config (dict): Configuration object containing necessary paths and parameters.
1752
+ crosswalk (dict): Crosswalk data containing the 'mapat_mapping' slicing information.
1753
+
1754
+ Returns:
1755
+ dict: A dictionary mapping patient IDs to insurance IDs.
1756
+ """
1757
+ # Retrieve MAPAT path and slicing information from the configuration
1758
+ ac = _ac()
1759
+ mapat_path = ac.get_mapat_med_path() if ac else ''
1760
+ mapat_slices = crosswalk['mapat_mapping']['slices']
1761
+
1762
+ # Initialize the dictionary to hold the patient ID to insurance ID mappings
1763
+ patient_id_to_insurance_id = {}
1764
+
1765
+ # Read data from MAPAT using a provided function to handle fixed-width data
1766
+ for record, _ in MediLink_DataMgmt.read_general_fixed_width_data(mapat_path, mapat_slices):
1767
+ patient_id = record['MAPATPXID']
1768
+ insurance_id = record['MAPATINID']
1769
+ patient_id_to_insurance_id[patient_id] = insurance_id
1770
+
1771
+ return patient_id_to_insurance_id
1772
+
1773
+ def parse_z_dat(z_dat_path, config): # Why is this in MediBot and not MediLink?
1774
+ """
1775
+ Parses the Z.dat file to map Patient IDs to Insurance Names using the provided fixed-width file format.
1776
+
1777
+ Args:
1778
+ z_dat_path (str): Path to the Z.dat file.
1779
+ config (dict): Configuration object containing slicing information and other parameters.
1780
+
1781
+ Returns:
1782
+ dict: A dictionary mapping Patient IDs to Insurance Names.
1783
+ """
1784
+ patient_id_to_insurance_name = {}
1785
+
1786
+ try:
1787
+ # Reading blocks of fixed-width data (up to 5 lines per record)
1788
+ for personal_info, insurance_info, service_info, service_info_2, service_info_3 in MediLink_DataMgmt.read_fixed_width_data(z_dat_path):
1789
+ # Parse Z.dat reserved record format: 3 active + 2 reserved lines
1790
+ parsed_data = MediLink_DataMgmt.parse_fixed_width_data(personal_info, insurance_info, service_info, service_info_2, service_info_3, config.get('MediLink_Config', config))
1791
+
1792
+ # Extract Patient ID and Insurance Name from parsed data
1793
+ patient_id = parsed_data.get('PATID')
1794
+ insurance_name = parsed_data.get('INAME')
1795
+
1796
+ if patient_id and insurance_name:
1797
+ patient_id_to_insurance_name[patient_id] = insurance_name
1798
+ MediLink_ConfigLoader.log("Mapped Patient ID {} to Insurance Name {}".format(patient_id, insurance_name), config, level="INFO")
1799
+
1800
+ except FileNotFoundError:
1801
+ MediLink_ConfigLoader.log("File not found: {}".format(z_dat_path), config, level="INFO")
1802
+ except Exception as e:
1803
+ MediLink_ConfigLoader.log("Failed to parse Z.dat: {}".format(str(e)), config, level="INFO")
1804
+
1805
+ return patient_id_to_insurance_name
1806
+
1807
+ def load_historical_payer_to_patient_mappings(config):
1808
+ """
1809
+ Loads historical mappings from multiple Carol's CSV files in a specified directory,
1810
+ mapping Payer IDs to sets of Patient IDs.
1811
+
1812
+ Args:
1813
+ config (dict): Configuration object containing the directory path for Carol's CSV files
1814
+ and other necessary parameters.
1815
+
1816
+ Returns:
1817
+ dict: A dictionary where each key is a Payer ID and the value is a set of Patient IDs.
1818
+ """
1819
+ directory_path = os.path.dirname(config['CSV_FILE_PATH'])
1820
+ payer_to_patient_ids = defaultdict(set)
1821
+
1822
+ try:
1823
+ # Check if the directory exists
1824
+ if not os.path.isdir(directory_path):
1825
+ raise FileNotFoundError("Directory '{}' not found.".format(directory_path))
1826
+
1827
+ # Loop through each file in the directory containing Carol's historical CSVs
1828
+ for filename in os.listdir(directory_path):
1829
+ file_path = os.path.join(directory_path, filename)
1830
+ if filename.endswith('.csv'):
1831
+ try:
1832
+ with open(file_path, 'r', encoding='utf-8') as csvfile:
1833
+ reader = csv.DictReader(csvfile)
1834
+ patient_count = 0 # Counter for Patient IDs found in this CSV
1835
+ for row in reader:
1836
+ if 'Patient ID' not in row or 'Ins1 Payer ID' not in row:
1837
+ continue # Skip this row if either key is missing
1838
+ if not row.get('Patient ID').strip() or not row.get('Ins1 Payer ID').strip():
1839
+ continue # Skip this row if either value is missing or empty
1840
+
1841
+ payer_id = row['Ins1 Payer ID'].strip()
1842
+ patient_id = row['Patient ID'].strip()
1843
+ payer_to_patient_ids[payer_id].add(patient_id)
1844
+ patient_count += 1 # Increment the counter for each valid mapping
1845
+
1846
+ # Log the accumulated count for this CSV file
1847
+ if patient_count > 0:
1848
+ MediLink_ConfigLoader.log("CSV file '{}' has {} Patient IDs with Payer IDs.".format(filename, patient_count), level="DEBUG")
1849
+ else:
1850
+ MediLink_ConfigLoader.log("CSV file '{}' is empty or does not have valid Patient ID or Payer ID mappings.".format(filename), level="DEBUG")
1851
+ except Exception as e:
1852
+ print("Error processing file {}: {}".format(filename, e))
1853
+ MediLink_ConfigLoader.log("Error processing file '{}': {}".format(filename, e), level="ERROR")
1854
+ except FileNotFoundError as e:
1855
+ print("Error: {}".format(e))
1856
+
1857
+ if not payer_to_patient_ids:
1858
+ print("No historical mappings were generated.")
1859
+
1860
+ return dict(payer_to_patient_ids)
1861
+
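A self-contained sketch of the accumulation performed above, with an in-memory CSV standing in for Carol's files (blank IDs and duplicate rows are dropped, matching the guards in the loop):

    import csv, io
    from collections import defaultdict

    sample = "Patient ID,Ins1 Payer ID\nPT001,87726\nPT002,87726\nPT001,87726\n,87726\n"
    payer_to_patient_ids = defaultdict(set)
    for row in csv.DictReader(io.StringIO(sample)):
        if row.get('Patient ID', '').strip() and row.get('Ins1 Payer ID', '').strip():
            payer_to_patient_ids[row['Ins1 Payer ID'].strip()].add(row['Patient ID'].strip())
    print(dict(payer_to_patient_ids))  # {'87726': {'PT001', 'PT002'}}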
1862
+ def capitalize_all_fields(csv_data):
1863
+ """
1864
+ Converts all text fields in the CSV data to uppercase.
1865
+
1866
+ Parameters:
1867
+ csv_data (list of dict): The CSV data where each row is represented as a dictionary.
1868
+
1869
+ Returns:
1870
+ None: The function modifies the csv_data in place.
1871
+ """
1872
+ # PERFORMANCE FIX: Optimize uppercase conversion while preserving complex types
1873
+ for row in csv_data:
1874
+ updated_row = {}
1875
+ for key, value in row.items():
1876
+ # Preserve internal/derived fields intact (e.g., `_all_surgery_dates`, `_surgery_date_to_diagnosis`)
1877
+ if isinstance(key, str) and key.startswith('_'):
1878
+ updated_row[key] = value
1879
+ continue
1880
+ # Uppercase plain strings
1881
+ if isinstance(value, str):
1882
+ updated_row[key] = value.upper()
1883
+ continue
1884
+ # Preserve complex containers; optionally uppercase their string contents
1885
+ if isinstance(value, list):
1886
+ updated_row[key] = [elem.upper() if isinstance(elem, str) else elem for elem in value]
1887
+ continue
1888
+ if isinstance(value, dict):
1889
+ updated_row[key] = {k: (v.upper() if isinstance(v, str) else v) for k, v in value.items()}
1890
+ continue
1891
+ # Leave datetimes as-is; coerce simple scalars to string upper for consistency
1892
+ if isinstance(value, datetime):
1893
+ updated_row[key] = value
1894
+ else:
1895
+ updated_row[key] = str(value).upper() if value is not None else value
1822
1896
  row.update(updated_row)