carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic. Click here for more details.
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
- carrot_transform-0.4.0.dist-info/RECORD +41 -0
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
- carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
- carrottransform/__init__.py +1 -1
- carrottransform/_version.py +2 -2
- carrottransform/cli/command.py +9 -5
- carrottransform/cli/subcommands/run.py +302 -443
- carrottransform/cli/subcommands/run_v2.py +145 -0
- carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
- carrottransform/examples/test/rules/v1.json +280 -0
- carrottransform/examples/test/rules/v2.json +115 -0
- carrottransform/tools/__init__.py +4 -14
- carrottransform/tools/args.py +128 -0
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/concept_helpers.py +61 -0
- carrottransform/tools/core.py +163 -0
- carrottransform/tools/date_helpers.py +79 -0
- carrottransform/tools/file_helpers.py +177 -7
- carrottransform/tools/logger.py +19 -0
- carrottransform/tools/mapping_types.py +32 -0
- carrottransform/tools/mappingrules.py +298 -32
- carrottransform/tools/metrics.py +274 -49
- carrottransform/tools/omopcdm.py +42 -32
- carrottransform/tools/orchestrator.py +381 -0
- carrottransform/tools/person_helpers.py +126 -0
- carrottransform/tools/record_builder.py +413 -0
- carrottransform/tools/stream_helpers.py +71 -0
- carrottransform/tools/types.py +71 -0
- carrottransform/tools/validation.py +62 -0
- carrot_transform-0.3.4.dist-info/RECORD +0 -24
- carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, Tuple, Set
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from carrottransform.tools.mapping_types import ConceptMapping
|
|
4
|
+
from carrottransform.tools.date_helpers import get_datetime_value
|
|
5
|
+
from carrottransform.tools.logger import logger_setup
|
|
6
|
+
from carrottransform.tools.validation import valid_value
|
|
7
|
+
from carrottransform.tools.types import (
|
|
8
|
+
RecordContext,
|
|
9
|
+
RecordResult,
|
|
10
|
+
)
|
|
11
|
+
from carrottransform.tools.concept_helpers import (
|
|
12
|
+
generate_combinations,
|
|
13
|
+
get_value_mapping,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
logger = logger_setup()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TargetRecordBuilder(ABC):
    """Base class for building target (OMOP) records from one source row.

    A concrete builder is constructed around a ``RecordContext`` carrying the
    current source row, the source/target column maps, the v2 mapping rules
    and the shared output state (open file handles, per-table record
    counters, the person-ID lookup and the metrics accumulator).
    """

    def __init__(self, context: "RecordContext"):
        # All per-row and shared output state lives on the context object.
        self.context = context

    @abstractmethod
    def build_records(self) -> "RecordResult":
        """Build target records - must be implemented by subclasses"""
        pass

    def create_empty_record(self) -> List[str]:
        """Create an empty target record with proper initialization.

        Returns one empty string per target column; NOT NULL numeric
        columns that exist in the target are pre-filled with "0".
        """
        tgtarray = [""] * len(self.context.tgtcolmap)

        # Initialize numeric fields to 0 (skip names absent from the target)
        for req_integer in self.context.notnull_numeric_fields:
            if req_integer in self.context.tgtcolmap:
                tgtarray[self.context.tgtcolmap[req_integer]] = "0"

        return tgtarray

    def apply_concept_mapping(self, tgtarray: List[str], concept_combo: Dict[str, int]):
        """Apply a single concept combination to the target array.

        Destination fields not present in the target column map are
        silently skipped; concept IDs are written as strings.
        """
        for dest_field, concept_id in concept_combo.items():
            if dest_field in self.context.tgtcolmap:
                tgtarray[self.context.tgtcolmap[dest_field]] = str(concept_id)

    def apply_original_value_mappings(
        self, tgtarray: List[str], original_value_fields: List[str], source_value: str
    ):
        """Apply original value mappings (direct copy of the raw source value)."""
        for dest_field in original_value_fields:
            if dest_field in self.context.tgtcolmap:
                tgtarray[self.context.tgtcolmap[dest_field]] = source_value

    def apply_person_id_mapping(self, tgtarray: List[str]):
        """Copy the (still unmapped) person ID from source to target.

        No-op when the mapping is absent or either column is unknown.
        The ID is translated to its mapped form later, inside
        ``write_record_directly``.
        """
        person_id_mapping = self.context.v2_mapping.person_id_mapping
        if not person_id_mapping:
            return

        if (
            person_id_mapping.dest_field in self.context.tgtcolmap
            and person_id_mapping.source_field in self.context.srccolmap
        ):
            person_id = self.context.srcdata[
                self.context.srccolmap[person_id_mapping.source_field]
            ]
            tgtarray[self.context.tgtcolmap[person_id_mapping.dest_field]] = person_id

    def apply_date_mappings(self, tgtarray: List[str]) -> bool:
        """Apply date mappings with proper error handling.

        Returns False only when a date value fails to parse; an absent
        mapping or missing source column counts as success (logged) so
        the record can still be written.
        """
        date_mapping = self.context.v2_mapping.date_mapping
        if not date_mapping:
            return True

        if date_mapping.source_field not in self.context.srccolmap:
            logger.warning(
                f"Date mapping source field not found in source data: {date_mapping.source_field}"
            )
            return True

        source_date = self.context.srcdata[
            self.context.srccolmap[date_mapping.source_field]
        ]

        for dest_field in date_mapping.dest_fields:
            if dest_field in self.context.tgtcolmap:
                if not self._apply_single_date_field(tgtarray, dest_field, source_date):
                    return False

        return True

    def _apply_single_date_field(
        self, tgtarray: List[str], dest_field: str, source_date: str
    ) -> bool:
        """Apply a single date field mapping.

        Three target shapes are handled:
        * component fields (birth year/month/day) - parsed and split out,
        * datetime fields with a linked date-only twin - full value plus
          the 10-character date prefix,
        * plain date fields - copied verbatim.
        Returns False when a component field fails to parse.
        """
        # Handle date component fields (birth dates with year/month/day)
        if dest_field in self.context.date_component_data:
            # Only the date part (text before any time component) is parsed
            dt = get_datetime_value(source_date.split(" ")[0])
            if dt is None:
                self.context.metrics.increment_key_count(
                    source=self.context.srcfilename,
                    fieldname=self.context.srcfield,
                    tablename=self.context.tgtfilename,
                    concept_id="all",
                    additional="",
                    count_type="invalid_date_fields",
                )
                logger.warning(f"Invalid date fields: {self.context.srcfield}")
                return False

            # Set individual date components where the target defines them
            component_info = self.context.date_component_data[dest_field]
            if (
                "year" in component_info
                and component_info["year"] in self.context.tgtcolmap
            ):
                tgtarray[self.context.tgtcolmap[component_info["year"]]] = str(dt.year)
            if (
                "month" in component_info
                and component_info["month"] in self.context.tgtcolmap
            ):
                tgtarray[self.context.tgtcolmap[component_info["month"]]] = str(
                    dt.month
                )
            if (
                "day" in component_info
                and component_info["day"] in self.context.tgtcolmap
            ):
                tgtarray[self.context.tgtcolmap[component_info["day"]]] = str(dt.day)

            # Set the main date field with the original (unsplit) value
            tgtarray[self.context.tgtcolmap[dest_field]] = source_date

        # Handle regular date fields with linked date-only fields
        elif dest_field in self.context.date_col_data:
            tgtarray[self.context.tgtcolmap[dest_field]] = source_date
            # Linked date-only column receives the YYYY-MM-DD prefix
            if self.context.date_col_data[dest_field] in self.context.tgtcolmap:
                tgtarray[
                    self.context.tgtcolmap[self.context.date_col_data[dest_field]]
                ] = source_date[:10]

        # Handle simple date fields
        else:
            tgtarray[self.context.tgtcolmap[dest_field]] = source_date

        return True

    def write_record_directly(self, output_record: List[str]) -> bool:
        """Write a single record directly to its open output file.

        Validates and maps the person ID, assigns the auto-increment
        record ID, updates metrics and writes the tab-separated line.
        Returns True on success.  An unknown person ID is counted as an
        ``invalid_person_ids`` metric and returns False.

        Fix over the previous version: the auto-increment counter is
        consumed only AFTER the person ID is validated, so rejected
        records no longer leave gaps in the output ID sequence.
        """
        ctx = self.context
        person_idx = ctx.tgtcolmap[ctx.person_id_col]
        person_id = output_record[person_idx]

        if person_id not in ctx.person_lookup:
            # Invalid person ID - count the rejection, consume nothing
            ctx.metrics.increment_key_count(
                source=ctx.srcfilename,
                fieldname="all",
                tablename=ctx.tgtfilename,
                concept_id="all",
                additional="",
                count_type="invalid_person_ids",
            )
            return False

        # Map person ID to its output form
        output_record[person_idx] = ctx.person_lookup[person_id]

        # Set auto-increment ID now that the record is known to be valid
        if ctx.auto_num_col is not None:
            output_record[ctx.tgtcolmap[ctx.auto_num_col]] = str(
                ctx.record_numbers[ctx.tgtfilename]
            )
            ctx.record_numbers[ctx.tgtfilename] += 1

        # Update metrics
        ctx.metrics.increment_with_datacol(
            source_path=ctx.srcfilename,
            target_file=ctx.tgtfilename,
            datacol=ctx.srcfield,
            out_record=output_record,
        )

        # Write directly to output file (files are kept open)
        ctx.file_handles[ctx.tgtfilename].write("\t".join(output_record) + "\n")
        return True
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class PersonRecordBuilder(TargetRecordBuilder):
    """Specialized builder for person table records.

    Merges the concept mappings of every mapped source field into one
    combined set before generating records, and deduplicates so each
    source row per person is emitted at most once.
    """

    def __init__(self, context: RecordContext):
        super().__init__(context)
        # Keys of person rows already handled ("<srcfile>:<raw person id>")
        self.processed_cache: Set[str] = set()

    def build_records(self) -> RecordResult:
        """Build person table records with special merging logic."""
        ctx = self.context
        failure = RecordResult(False, 0, ctx.metrics)

        id_mapping = ctx.v2_mapping.person_id_mapping
        if not id_mapping:
            return failure

        # Deduplicate: each source row for a person is processed once only
        raw_id = ctx.srcdata[ctx.srccolmap[id_mapping.source_field]]
        person_key = f"{ctx.srcfilename}:{raw_id}"
        if person_key in self.processed_cache:
            return failure
        self.processed_cache.add(person_key)

        # Gather concept and pass-through mappings across every field
        merged_concepts, merged_originals = self._collect_all_mappings()
        if not merged_concepts and not merged_originals:
            return failure

        # One combined combination set; a single empty combo when there
        # are only pass-through values to write
        combos = (
            generate_combinations(merged_concepts) if merged_concepts else [{}]
        )

        written = 0
        for combo in combos:
            candidate = self._build_single_person_record(combo, merged_originals)
            if candidate and self.write_record_directly(candidate):
                written += 1

        return RecordResult(written > 0, written, ctx.metrics)

    def _collect_all_mappings(self) -> Tuple[Dict[str, List[int]], Dict[str, str]]:
        """Collect all concept mappings and original values from all fields."""
        ctx = self.context
        merged_concepts: Dict[str, List[int]] = {}
        merged_originals: Dict[str, str] = {}

        for field_name, concept_mapping in ctx.v2_mapping.concept_mappings.items():
            if field_name not in ctx.srccolmap:
                continue

            # Skip fields whose source value is empty/invalid
            raw = str(ctx.srcdata[ctx.srccolmap[field_name]])
            if not valid_value(raw):
                continue

            # Fold this field's destination-field -> concept-id lists in
            field_concepts = get_value_mapping(concept_mapping, raw)
            if field_concepts:
                merged_concepts.update(field_concepts)

            # Remember raw values for any pass-through destination fields
            if concept_mapping.original_value_fields:
                for dest in concept_mapping.original_value_fields:
                    merged_originals[dest] = raw

        return merged_concepts, merged_originals

    def _build_single_person_record(
        self, concept_combo: Dict[str, int], all_original_values: Dict[str, str]
    ) -> Optional[List[str]]:
        """Build a single person record; None when date mapping fails."""
        row = self.create_empty_record()
        colmap = self.context.tgtcolmap

        # Merged concept combination first
        self.apply_concept_mapping(row, concept_combo)

        # Then the raw pass-through values
        for dest, raw in all_original_values.items():
            if dest in colmap:
                row[colmap[dest]] = raw

        # Person ID and dates last
        self.apply_person_id_mapping(row)

        if not self.apply_date_mappings(row):
            logger.warning("Failed to apply date mappings for person table")
            return None

        return row
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class StandardRecordBuilder(TargetRecordBuilder):
    """Builder for standard (non-person) table records"""

    def build_records(self) -> RecordResult:
        """Build standard table records.

        One output record is produced per concept combination derived
        from the single source field on the context.  Returns a failed
        RecordResult when the source value is invalid, no mapping
        exists, or any record fails to build or write.
        """
        # Check if source field has a value
        if not valid_value(
            str(self.context.srcdata[self.context.srccolmap[self.context.srcfield]])
        ):
            self.context.metrics.increment_key_count(
                source=self.context.srcfilename,
                fieldname=self.context.srcfield,
                tablename=self.context.tgtfilename,
                concept_id="all",
                additional="",
                count_type="invalid_source_fields",
            )
            return RecordResult(False, 0, self.context.metrics)

        # Check if we have a concept mapping for this field
        if self.context.srcfield not in self.context.v2_mapping.concept_mappings:
            return RecordResult(False, 0, self.context.metrics)

        concept_mapping = self.context.v2_mapping.concept_mappings[
            self.context.srcfield
        ]
        source_value = str(
            self.context.srcdata[self.context.srccolmap[self.context.srcfield]]
        )

        # Get value mapping (concept mappings or wildcard)
        value_mapping = get_value_mapping(concept_mapping, source_value)

        # Only proceed if we have concept mappings OR original value fields
        if not value_mapping and not concept_mapping.original_value_fields:
            return RecordResult(False, 0, self.context.metrics)

        # Generate all concept combinations
        concept_combinations = generate_combinations(value_mapping)

        # If no concept combinations, don't build records
        if not concept_combinations:
            return RecordResult(False, 0, self.context.metrics)

        # Create records for each concept combination.
        # NOTE(review): a failure part-way through returns a failed result
        # even though earlier combinations have already been written to the
        # output file — confirm this partial-write behaviour is intended.
        record_count = 0
        for concept_combo in concept_combinations:
            record = self._build_single_standard_record(
                concept_combo, concept_mapping, source_value
            )
            if record:
                # Write record directly using the built-in method
                if self.write_record_directly(record):
                    record_count += 1
                else:
                    # If write fails (e.g. invalid person ID), return failure
                    return RecordResult(False, 0, self.context.metrics)
            else:
                # If any record fails (date mapping error), return failure
                return RecordResult(False, 0, self.context.metrics)

        return RecordResult(record_count > 0, record_count, self.context.metrics)

    def _build_single_standard_record(
        self,
        concept_combo: Dict[str, int],
        concept_mapping: ConceptMapping,
        source_value: str,
    ) -> Optional[List[str]]:
        """Build a single standard record.

        Returns the populated target row, or None when the date mapping
        cannot be applied (invalid source date).
        """
        tgtarray = self.create_empty_record()

        # Apply this specific concept combination
        self.apply_concept_mapping(tgtarray, concept_combo)

        # Handle original value fields (direct field copying)
        if concept_mapping.original_value_fields:
            self.apply_original_value_mappings(
                tgtarray, concept_mapping.original_value_fields, source_value
            )

        # Handle person ID mapping (raw ID; mapped later at write time)
        self.apply_person_id_mapping(tgtarray)

        # Handle date mappings
        if not self.apply_date_mappings(tgtarray):
            logger.warning(f"Failed to apply date mappings for {self.context.srcfield}")
            return None

        return tgtarray
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
class RecordBuilderFactory:
    """Factory for creating appropriate record builders"""

    # Shared across the whole run so each person row is emitted at most
    # once, regardless of how many builder instances are created.
    _person_processed_cache: Set[str] = set()

    @classmethod
    def create_builder(cls, context: RecordContext) -> TargetRecordBuilder:
        """Create the appropriate record builder based on table type."""
        if context.tgtfilename != "person":
            return StandardRecordBuilder(context)

        person_builder = PersonRecordBuilder(context)
        # Every person builder shares the class-level dedupe cache
        person_builder.processed_cache = cls._person_processed_cache
        return person_builder

    @classmethod
    def clear_person_cache(cls):
        """Clear the person processed cache (useful for testing or new runs)"""
        cls._person_processed_cache.clear()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import Dict, Set, Any
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from carrottransform.tools.mappingrules import MappingRules
|
|
4
|
+
from carrottransform.tools.omopcdm import OmopCDM
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class StreamingLookupCache:
    """Pre-computed lookup tables for efficient streaming processing"""

    def __init__(self, mappingrules: MappingRules, omopcdm: OmopCDM):
        self.mappingrules = mappingrules
        self.omopcdm = omopcdm

        # Build every lookup up front so per-row work is pure dict access
        self.input_to_outputs = self._build_input_to_output_lookup()
        self.file_metadata_cache = self._build_file_metadata_cache()
        self.target_metadata_cache = self._build_target_metadata_cache()

    def _build_input_to_output_lookup(self) -> Dict[str, Set[str]]:
        """Build lookup: input_file -> set of output tables it can map to"""
        outputs_by_input: Dict[str, Set[str]] = {}

        # v2_mappings is keyed target-first; invert it to source-first
        for target_file, source_mappings in self.mappingrules.v2_mappings.items():
            for source_file in source_mappings:
                outputs_by_input.setdefault(source_file, set()).add(target_file)

        return outputs_by_input

    def _build_file_metadata_cache(self) -> Dict[str, Dict[str, Any]]:
        """Pre-compute metadata for each input file"""
        rules = self.mappingrules
        cache: Dict[str, Dict[str, Any]] = {}

        for input_file in rules.get_all_infile_names():
            datetime_source, person_id_source = rules.get_infile_date_person_id(
                input_file
            )
            cache[input_file] = {
                "datetime_source": datetime_source,
                "person_id_source": person_id_source,
                "data_fields": rules.get_infile_data_fields(input_file),
            }

        return cache

    def _build_target_metadata_cache(self) -> Dict[str, Dict[str, Any]]:
        """Pre-compute metadata for each target table"""
        cdm = self.omopcdm
        cache: Dict[str, Dict[str, Any]] = {}

        for target_file in self.mappingrules.get_all_outfile_names():
            cache[target_file] = {
                "auto_num_col": cdm.get_omop_auto_number_field(target_file),
                "person_id_col": cdm.get_omop_person_id_field(target_file),
                "date_col_data": cdm.get_omop_datetime_linked_fields(target_file),
                "date_component_data": cdm.get_omop_date_field_components(
                    target_file
                ),
                "notnull_numeric_fields": cdm.get_omop_notnull_numeric_fields(
                    target_file
                ),
            }

        return cache
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, TextIO
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import carrottransform.tools as tools
|
|
5
|
+
from carrottransform.tools.omopcdm import OmopCDM
|
|
6
|
+
from carrottransform.tools.mapping_types import V2TableMapping
|
|
7
|
+
from carrottransform.tools.mappingrules import MappingRules
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ProcessingContext:
    """Context object containing all processing configuration and state"""

    mappingrules: MappingRules  # parsed mapping rules driving the transform
    omopcdm: OmopCDM  # OMOP CDM metadata helper
    input_dir: Path  # directory holding the source input files
    person_lookup: Dict[str, str]  # source person ID -> mapped output person ID
    record_numbers: Dict[str, int]  # per-target-table auto-increment counters
    file_handles: Dict[str, TextIO]  # open output file handle per target table
    target_column_maps: Dict[str, Dict[str, int]]  # column name -> index, per target
    metrics: tools.metrics.Metrics  # shared metrics accumulator (updated in place)

    @property
    def input_files(self) -> List[str]:
        """All input file names referenced by the mapping rules."""
        return self.mappingrules.get_all_infile_names()

    @property
    def output_files(self) -> List[str]:
        """All output table names referenced by the mapping rules."""
        return self.mappingrules.get_all_outfile_names()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class RecordResult:
    """Result of record building operation"""

    success: bool  # True when at least one record was built and written
    record_count: int  # number of records actually written
    metrics: tools.metrics.Metrics  # metrics accumulator (shared, mutated in place)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class RecordContext:
    """Context object containing all the data needed for record building"""

    tgtfilename: str  # target table name (e.g. "person")
    tgtcolmap: Dict[str, int]  # target column name -> index
    v2_mapping: V2TableMapping  # mapping rules for this source/target pair
    srcfield: str  # source field currently being processed
    srcdata: List[str]  # current source row values
    srccolmap: Dict[str, int]  # source column name -> index
    srcfilename: str  # source file name (used in metrics keys)
    omopcdm: OmopCDM  # OMOP CDM metadata helper
    metrics: tools.metrics.Metrics  # shared metrics accumulator
    person_lookup: Dict[str, str]  # source person ID -> mapped output person ID
    record_numbers: Dict[str, int]  # per-target-table auto-increment counters
    file_handles: Dict[str, TextIO]  # open output file handle per target table
    auto_num_col: Optional[str]  # auto-increment ID column, if the target has one
    person_id_col: str  # person ID column in the target table
    date_col_data: Dict[str, str]  # datetime column -> linked date-only column
    date_component_data: Dict[str, Dict[str, str]]  # date column -> year/month/day columns
    notnull_numeric_fields: List[str]  # numeric columns pre-filled with "0"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class ProcessingResult:
    """Result of data processing operation"""

    output_counts: Dict[str, int]  # records written per output table
    rejected_id_counts: Dict[str, int]  # rejected person-ID counts (presumably per source — confirm against producer)
    success: bool = True  # overall success flag; defaults to success
    error_message: Optional[str] = None  # human-readable failure description, if any
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from carrottransform.tools.logger import logger_setup
|
|
3
|
+
|
|
4
|
+
logger = logger_setup()
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def valid_value(value: str) -> bool:
    """Check if a value is valid (not empty/null)"""
    # A value consisting solely of whitespace counts as empty
    return bool(value.strip())
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def valid_date_value(item: str) -> bool:
    """
    Check if a date item is non null and parses in one of the accepted
    formats: ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY) or UK (DD/MM/YYYY).

    Note: despite what the previous docstring claimed, US-style
    MM/DD/YYYY is not a supported format — slash-separated values are
    parsed day-first, so "01/02/2021" is the 1st of February.

    Logs a warning and returns False for unparseable values; empty or
    whitespace-only values are rejected silently.
    """
    # Empty / whitespace-only values: reject without logging
    if item.strip() == "":
        return False
    if (
        not _valid_iso_date(item)
        and not _valid_reverse_iso_date(item)
        and not _valid_uk_date(item)
    ):
        logger.warning("Bad date : `{0}`".format(item))
        return False
    return True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _valid_iso_date(item: str) -> bool:
|
|
30
|
+
"""
|
|
31
|
+
Check if a date item is non null and parses as ISO (YYYY-MM-DD)
|
|
32
|
+
"""
|
|
33
|
+
try:
|
|
34
|
+
datetime.datetime.strptime(item, "%Y-%m-%d")
|
|
35
|
+
except ValueError:
|
|
36
|
+
return False
|
|
37
|
+
|
|
38
|
+
return True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _valid_reverse_iso_date(item: str) -> bool:
|
|
42
|
+
"""
|
|
43
|
+
Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
|
|
44
|
+
"""
|
|
45
|
+
try:
|
|
46
|
+
datetime.datetime.strptime(item, "%d-%m-%Y")
|
|
47
|
+
except ValueError:
|
|
48
|
+
return False
|
|
49
|
+
|
|
50
|
+
return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _valid_uk_date(item: str) -> bool:
|
|
54
|
+
"""
|
|
55
|
+
Check if a date item is non null and parses as UK format (DD/MM/YYYY)
|
|
56
|
+
"""
|
|
57
|
+
try:
|
|
58
|
+
datetime.datetime.strptime(item, "%d/%m/%Y")
|
|
59
|
+
except ValueError:
|
|
60
|
+
return False
|
|
61
|
+
|
|
62
|
+
return True
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
|
|
2
|
-
carrottransform/_version.py,sha256=bm7SM-_MN0gstlNsCDO6dAajKcjQD-NxI_xpvfRx0Ts,172
|
|
3
|
-
carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
|
|
5
|
-
carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
carrottransform/cli/subcommands/run.py,sha256=r2XanTvy4QowPbziZ5lqs-Tm8CAzCquL7DRy4lTT9Ak,23977
|
|
7
|
-
carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
|
|
8
|
-
carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
|
|
9
|
-
carrottransform/examples/test/inputs/Covid19_test.csv,sha256=d5t7Lfhkwbfe3Uk2IBqB2ZT5o0h9QaeraC8E5-IMERo,67521
|
|
10
|
-
carrottransform/examples/test/inputs/Demographics.csv,sha256=_ukUTpD4g751sL_mSL3f26T_Edd2kvH-evwm54VfXJI,85237
|
|
11
|
-
carrottransform/examples/test/inputs/Symptoms.csv,sha256=5dvGv16PNJJO_lFc0reRmQbE3m7iWfWajl51JDsqg0M,78447
|
|
12
|
-
carrottransform/examples/test/inputs/covid19_antibody.csv,sha256=SPCpyqpTbVq9987jXZ8AS4FEkrchRMAIYhTQJjfpwfY,98927
|
|
13
|
-
carrottransform/examples/test/inputs/vaccine.csv,sha256=_gcM-SIymyt2Dkkr_zGmQI9keIdmDm-gDI_QvXXLFrY,44037
|
|
14
|
-
carrottransform/examples/test/rules/rules_14June2021.json,sha256=n2OYNFhbx-NLhmqjAad6RsfXjQFknZIgQ7a5uyJF0Co,13226
|
|
15
|
-
carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
|
|
16
|
-
carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
|
|
17
|
-
carrottransform/tools/mappingrules.py,sha256=IiZx24G27Rag-YgV-4jDxprJea9Ce7SZUbjxMm0n49k,7040
|
|
18
|
-
carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
|
|
19
|
-
carrottransform/tools/omopcdm.py,sha256=MwS_MwwBrypwjbFLuxoE0xlddWIi0T3BEPgN9LPkGAs,8508
|
|
20
|
-
carrot_transform-0.3.4.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
|
|
21
|
-
carrot_transform-0.3.4.dist-info/METADATA,sha256=mbB8-GgOH6EnJXDr2j46Q97R3ID4Dro9IbgAFcJVAXY,4219
|
|
22
|
-
carrot_transform-0.3.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
23
|
-
carrot_transform-0.3.4.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
|
|
24
|
-
carrot_transform-0.3.4.dist-info/RECORD,,
|
|
File without changes
|