carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic. Click here for more details.
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
- carrot_transform-0.4.0.dist-info/RECORD +41 -0
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
- carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
- carrottransform/__init__.py +1 -1
- carrottransform/_version.py +2 -2
- carrottransform/cli/command.py +9 -5
- carrottransform/cli/subcommands/run.py +302 -443
- carrottransform/cli/subcommands/run_v2.py +145 -0
- carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
- carrottransform/examples/test/rules/v1.json +280 -0
- carrottransform/examples/test/rules/v2.json +115 -0
- carrottransform/tools/__init__.py +4 -14
- carrottransform/tools/args.py +128 -0
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/concept_helpers.py +61 -0
- carrottransform/tools/core.py +163 -0
- carrottransform/tools/date_helpers.py +79 -0
- carrottransform/tools/file_helpers.py +177 -7
- carrottransform/tools/logger.py +19 -0
- carrottransform/tools/mapping_types.py +32 -0
- carrottransform/tools/mappingrules.py +298 -32
- carrottransform/tools/metrics.py +274 -49
- carrottransform/tools/omopcdm.py +42 -32
- carrottransform/tools/orchestrator.py +381 -0
- carrottransform/tools/person_helpers.py +126 -0
- carrottransform/tools/record_builder.py +413 -0
- carrottransform/tools/stream_helpers.py +71 -0
- carrottransform/tools/types.py +71 -0
- carrottransform/tools/validation.py +62 -0
- carrot_transform-0.3.4.dist-info/RECORD +0 -24
- carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,26 +1,122 @@
|
|
|
1
|
-
import os
|
|
2
1
|
import json
|
|
2
|
+
from pathlib import Path
|
|
3
3
|
import carrottransform.tools as tools
|
|
4
|
-
from
|
|
4
|
+
from typing import Dict, Any, List, Optional
|
|
5
|
+
from carrottransform.tools.mapping_types import (
|
|
6
|
+
PersonIdMapping,
|
|
7
|
+
DateMapping,
|
|
8
|
+
ConceptMapping,
|
|
9
|
+
V2TableMapping,
|
|
10
|
+
)
|
|
11
|
+
from carrottransform.tools.logger import logger_setup
|
|
12
|
+
from carrottransform.tools.omopcdm import OmopCDM
|
|
13
|
+
|
|
14
|
+
logger = logger_setup()
|
|
15
|
+
|
|
5
16
|
|
|
6
17
|
class MappingRules:
|
|
7
18
|
"""
|
|
8
|
-
self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
|
|
19
|
+
self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
|
|
9
20
|
as a file-specific dictionary allowing rules to be "looked-up" depending on data content
|
|
10
21
|
"""
|
|
11
22
|
|
|
12
|
-
def __init__(self, rulesfilepath, omopcdm):
|
|
23
|
+
def __init__(self, rulesfilepath: Path, omopcdm: OmopCDM):
|
|
13
24
|
## just loads the json directly
|
|
14
|
-
self.rules_data = tools.load_json(rulesfilepath)
|
|
25
|
+
self.rules_data = tools.load_json(Path(rulesfilepath))
|
|
15
26
|
self.omopcdm = omopcdm
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
self.
|
|
27
|
+
|
|
28
|
+
# Detect format version and parse accordingly
|
|
29
|
+
self.is_v2_format = self._is_v2_format()
|
|
30
|
+
if self.is_v2_format:
|
|
31
|
+
logger.info("Detected v2.json format, using direct v2 parser...")
|
|
32
|
+
self.v2_mappings = self._parse_v2_format()
|
|
33
|
+
else:
|
|
34
|
+
logger.info("Detected v1.json format, using legacy parser...")
|
|
35
|
+
|
|
36
|
+
self.parsed_rules: Dict[str, Dict[str, Any]] = {}
|
|
37
|
+
self.outfile_names: Dict[str, List[str]] = {}
|
|
19
38
|
|
|
20
39
|
self.dataset_name = self.get_dsname_from_rules()
|
|
21
40
|
|
|
41
|
+
def _is_v2_format(self) -> bool:
|
|
42
|
+
"""
|
|
43
|
+
Detect if the rules file is in v2 format by checking for characteristic v2 structures
|
|
44
|
+
"""
|
|
45
|
+
# Check if any table has the v2 structure (source_table -> mapping_types)
|
|
46
|
+
for table_name, table_data in self.rules_data["cdm"].items():
|
|
47
|
+
if isinstance(table_data, dict):
|
|
48
|
+
for key, value in table_data.items():
|
|
49
|
+
# v2 format has CSV filenames as keys, with mapping types as values
|
|
50
|
+
if isinstance(value, dict) and all(
|
|
51
|
+
mapping_type in value
|
|
52
|
+
for mapping_type in [
|
|
53
|
+
"person_id_mapping",
|
|
54
|
+
"date_mapping",
|
|
55
|
+
"concept_mappings",
|
|
56
|
+
]
|
|
57
|
+
):
|
|
58
|
+
return True
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
def _parse_v2_format(self) -> Dict[str, Dict[str, V2TableMapping]]:
    """
    Parse the raw v2 rules under ``self.rules_data["cdm"]`` into mapping objects.

    For every destination table, and every source table listed beneath it,
    the optional ``person_id_mapping``, ``date_mapping`` and
    ``concept_mappings`` sections are wrapped in PersonIdMapping, DateMapping
    and ConceptMapping objects and bundled into one V2TableMapping.

    Returns:
        Dict[table_name, Dict[source_table, V2TableMapping]]
    """
    v2_mappings: Dict[str, Dict[str, V2TableMapping]] = {}

    for table_name, table_data in self.rules_data["cdm"].items():
        v2_mappings[table_name] = {}

        for source_table, mappings in table_data.items():
            # person_id_mapping is optional; it stays None when absent
            person_id_mapping = None
            if "person_id_mapping" in mappings:
                pid_data = mappings["person_id_mapping"]
                person_id_mapping = PersonIdMapping(
                    source_field=pid_data["source_field"],
                    dest_field=pid_data["dest_field"],
                )

            # date_mapping is optional; it stays None when absent
            date_mapping = None
            if "date_mapping" in mappings:
                date_data = mappings["date_mapping"]
                date_mapping = DateMapping(
                    source_field=date_data["source_field"],
                    # rules key is singular "dest_field"; the DateMapping
                    # keyword is plural "dest_fields"
                    dest_fields=date_data["dest_field"],
                )

            # concept_mappings: one ConceptMapping per source field
            concept_mappings = {}
            if "concept_mappings" in mappings:
                for source_field, field_mappings in mappings[
                    "concept_mappings"
                ].items():
                    original_value_fields = field_mappings.get("original_value", [])
                    # every key except "original_value" is a source value
                    # paired with its destination mappings
                    value_mappings = {
                        source_value: dest_mappings
                        for source_value, dest_mappings in field_mappings.items()
                        if source_value != "original_value"
                    }

                    concept_mappings[source_field] = ConceptMapping(
                        source_field=source_field,
                        value_mappings=value_mappings,
                        original_value_fields=original_value_fields,
                    )

            v2_mappings[table_name][source_table] = V2TableMapping(
                source_table=source_table,
                person_id_mapping=person_id_mapping,
                date_mapping=date_mapping,
                concept_mappings=concept_mappings,
            )

    return v2_mappings
|
|
117
|
+
|
|
22
118
|
def dump_parsed_rules(self):
    """Return ``self.parsed_rules`` serialised as JSON text with a 2-space indent."""
    serialised = json.dumps(self.parsed_rules, indent=2)
    return serialised
|
|
24
120
|
|
|
25
121
|
def get_dsname_from_rules(self):
|
|
26
122
|
dsname = "Unknown"
|
|
@@ -35,23 +131,62 @@ class MappingRules:
|
|
|
35
131
|
return self.dataset_name
|
|
36
132
|
|
|
37
133
|
def get_all_outfile_names(self):
    """
    List the destination (output) table names defined by the rules.

    v2 rules take the names from the keys of ``self.v2_mappings``; v1 rules
    take them from the keys of ``self.rules_data["cdm"]``.
    """
    if not self.is_v2_format:
        return list(self.rules_data["cdm"])
    return list(self.v2_mappings.keys())
|
|
39
138
|
|
|
40
139
|
def get_all_infile_names(self):
    """Return the input file names, delegating to the v2 or v1 implementation."""
    collect = (
        self._get_all_infile_names_v2
        if self.is_v2_format
        else self._get_all_infile_names_v1
    )
    return collect()
|
|
144
|
+
|
|
145
|
+
def _get_all_infile_names_v2(self) -> List[str]:
|
|
146
|
+
"""Get all input file names from v2 format"""
|
|
41
147
|
file_list = []
|
|
148
|
+
for table_mappings in self.v2_mappings.values():
|
|
149
|
+
for source_table in table_mappings.keys():
|
|
150
|
+
if source_table not in file_list:
|
|
151
|
+
file_list.append(source_table)
|
|
152
|
+
return file_list
|
|
42
153
|
|
|
154
|
+
def _get_all_infile_names_v1(self) -> List[str]:
|
|
155
|
+
"""Get all input file names from v1 format (legacy method)"""
|
|
156
|
+
file_list = []
|
|
43
157
|
for outfilename, conditions in self.rules_data["cdm"].items():
|
|
44
158
|
for outfield, source_field in conditions.items():
|
|
45
159
|
for source_field_name, source_data in source_field.items():
|
|
46
160
|
if "source_table" in source_data:
|
|
47
161
|
if source_data["source_table"] not in file_list:
|
|
48
162
|
file_list.append(source_data["source_table"])
|
|
49
|
-
|
|
50
163
|
return file_list
|
|
51
|
-
|
|
52
|
-
def get_infile_data_fields(self, infilename):
|
|
53
|
-
data_fields_lists = {}
|
|
54
164
|
|
|
165
|
+
def get_infile_data_fields(self, infilename: str):
    """Return the data fields for ``infilename`` via the v2 or v1 implementation."""
    if not self.is_v2_format:
        return self._get_infile_data_fields_v1(infilename)
    return self._get_infile_data_fields_v2(infilename)
|
|
170
|
+
|
|
171
|
+
def _get_infile_data_fields_v2(self, infilename: str) -> Dict[str, List[str]]:
|
|
172
|
+
"""Get data fields for a specific input file from v2 format"""
|
|
173
|
+
data_fields_lists: Dict[str, List[str]] = {}
|
|
174
|
+
|
|
175
|
+
for table_name, table_mappings in self.v2_mappings.items():
|
|
176
|
+
if infilename in table_mappings:
|
|
177
|
+
mapping = table_mappings[infilename]
|
|
178
|
+
data_fields_lists[table_name] = []
|
|
179
|
+
|
|
180
|
+
# Add fields from concept mappings
|
|
181
|
+
for source_field in mapping.concept_mappings.keys():
|
|
182
|
+
if source_field not in data_fields_lists[table_name]:
|
|
183
|
+
data_fields_lists[table_name].append(source_field)
|
|
184
|
+
|
|
185
|
+
return data_fields_lists
|
|
186
|
+
|
|
187
|
+
def _get_infile_data_fields_v1(self, infilename: str) -> Dict[str, List[str]]:
|
|
188
|
+
"""Get data fields for a specific input file from v1 format (legacy method)"""
|
|
189
|
+
data_fields_lists: Dict[str, List[str]] = {}
|
|
55
190
|
outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
|
|
56
191
|
|
|
57
192
|
for outfilename in outfilenames:
|
|
@@ -70,7 +205,36 @@ class MappingRules:
|
|
|
70
205
|
|
|
71
206
|
return data_fields_lists
|
|
72
207
|
|
|
73
|
-
def get_infile_date_person_id(self, infilename):
|
|
208
|
+
def get_infile_date_person_id(self, infilename: str):
    """Return the result of the v2 or v1 date/person-id lookup for ``infilename``."""
    lookup = (
        self._get_infile_date_person_id_v2
        if self.is_v2_format
        else self._get_infile_date_person_id_v1
    )
    return lookup(infilename)
|
|
213
|
+
|
|
214
|
+
# TODO: combine this with _get_person_source_field_info_v2
|
|
215
|
+
def _get_infile_date_person_id_v2(self, infilename: str) -> tuple[str, str]:
|
|
216
|
+
"""Get datetime and person_id source fields for v2 format"""
|
|
217
|
+
datetime_source = ""
|
|
218
|
+
person_id_source = ""
|
|
219
|
+
|
|
220
|
+
for table_mappings in self.v2_mappings.values():
|
|
221
|
+
if infilename in table_mappings:
|
|
222
|
+
mapping = table_mappings[infilename]
|
|
223
|
+
|
|
224
|
+
if mapping.date_mapping:
|
|
225
|
+
datetime_source = mapping.date_mapping.source_field
|
|
226
|
+
|
|
227
|
+
if mapping.person_id_mapping:
|
|
228
|
+
person_id_source = mapping.person_id_mapping.source_field
|
|
229
|
+
|
|
230
|
+
# If we found both, we can break
|
|
231
|
+
if datetime_source and person_id_source:
|
|
232
|
+
break
|
|
233
|
+
|
|
234
|
+
return datetime_source, person_id_source
|
|
235
|
+
|
|
236
|
+
def _get_infile_date_person_id_v1(self, infilename: str) -> tuple[str, str]:
|
|
237
|
+
"""Get datetime and person_id source fields for v1 format (legacy method)"""
|
|
74
238
|
outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
|
|
75
239
|
datetime_source = ""
|
|
76
240
|
person_id_source = ""
|
|
@@ -80,27 +244,65 @@ class MappingRules:
|
|
|
80
244
|
outfile = keydata[-1]
|
|
81
245
|
for outfield_elem in outfield_data:
|
|
82
246
|
for infield, outfield_list in outfield_elem.items():
|
|
83
|
-
|
|
247
|
+
logger.debug(
|
|
248
|
+
"{0}, {1}, {2}".format(outfile, infield, str(outfield_list))
|
|
249
|
+
)
|
|
84
250
|
for outfield in outfield_list:
|
|
85
|
-
if outfield.split(
|
|
251
|
+
if outfield.split("~")[
|
|
252
|
+
0
|
|
253
|
+
] in self.omopcdm.get_omop_datetime_fields(outfile):
|
|
86
254
|
datetime_source = infield
|
|
87
|
-
if outfield.split(
|
|
255
|
+
if outfield.split("~")[
|
|
256
|
+
0
|
|
257
|
+
] == self.omopcdm.get_omop_person_id_field(outfile):
|
|
88
258
|
person_id_source = infield
|
|
89
259
|
|
|
90
260
|
return datetime_source, person_id_source
|
|
91
261
|
|
|
92
|
-
def get_person_source_field_info(self, tgtfilename):
|
|
262
|
+
def get_person_source_field_info(self, tgtfilename: str):
    """Return the person source-field info for ``tgtfilename`` via the v2 or v1 implementation."""
    if not self.is_v2_format:
        return self._get_person_source_field_info_v1(tgtfilename)
    return self._get_person_source_field_info_v2(tgtfilename)
|
|
267
|
+
|
|
268
|
+
def _get_person_source_field_info_v2(
|
|
269
|
+
self, tgtfilename: str
|
|
270
|
+
) -> tuple[Optional[str], Optional[str]]:
|
|
93
271
|
"""
|
|
94
|
-
|
|
272
|
+
Get person source field info for v2 format,
|
|
273
|
+
from the dest. table "Person" in the rules file.
|
|
95
274
|
"""
|
|
96
275
|
birth_datetime_source = None
|
|
97
276
|
person_id_source = None
|
|
277
|
+
|
|
278
|
+
if tgtfilename in self.v2_mappings:
|
|
279
|
+
for mapping in self.v2_mappings[tgtfilename].values():
|
|
280
|
+
if mapping.date_mapping:
|
|
281
|
+
birth_datetime_source = mapping.date_mapping.source_field
|
|
282
|
+
|
|
283
|
+
if mapping.person_id_mapping:
|
|
284
|
+
person_id_source = mapping.person_id_mapping.source_field
|
|
285
|
+
|
|
286
|
+
# If we found both, we can break
|
|
287
|
+
if birth_datetime_source and person_id_source:
|
|
288
|
+
break
|
|
289
|
+
|
|
290
|
+
return birth_datetime_source, person_id_source
|
|
291
|
+
|
|
292
|
+
def _get_person_source_field_info_v1(
|
|
293
|
+
self, tgtfilename: str
|
|
294
|
+
) -> tuple[Optional[str], Optional[str]]:
|
|
295
|
+
"""Get person source field info for v1 format (legacy method)"""
|
|
296
|
+
birth_datetime_source = None
|
|
297
|
+
person_id_source = None
|
|
98
298
|
if tgtfilename in self.rules_data["cdm"]:
|
|
99
299
|
source_rules_data = self.rules_data["cdm"][tgtfilename]
|
|
100
300
|
## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unnecessary looping
|
|
101
301
|
for rule_name, rule_fields in source_rules_data.items():
|
|
102
302
|
if "birth_datetime" in rule_fields:
|
|
103
|
-
birth_datetime_source = rule_fields["birth_datetime"][
|
|
303
|
+
birth_datetime_source = rule_fields["birth_datetime"][
|
|
304
|
+
"source_field"
|
|
305
|
+
]
|
|
104
306
|
if "person_id" in rule_fields:
|
|
105
307
|
person_id_source = rule_fields["person_id"]["source_field"]
|
|
106
308
|
|
|
@@ -122,7 +324,23 @@ class MappingRules:
|
|
|
122
324
|
if key != "":
|
|
123
325
|
if key not in outdata:
|
|
124
326
|
outdata[key] = []
|
|
125
|
-
|
|
327
|
+
if key.split("~")[-1] == "person":
|
|
328
|
+
outdata[key].append(data)
|
|
329
|
+
|
|
330
|
+
if key.split("~")[-1] == "person":
|
|
331
|
+
# Find matching source field keys and merge their dictionaries
|
|
332
|
+
for source_field, value in data.items():
|
|
333
|
+
if source_field in outdata[key][0] and isinstance(
|
|
334
|
+
outdata[key][0][source_field], dict
|
|
335
|
+
):
|
|
336
|
+
# Merge the dictionaries for this source field
|
|
337
|
+
outdata[key][0][source_field].update(value)
|
|
338
|
+
else:
|
|
339
|
+
# If no matching dict or new source field, just set it
|
|
340
|
+
outdata[key][0][source_field] = value
|
|
341
|
+
pass
|
|
342
|
+
else:
|
|
343
|
+
outdata[key].append(data)
|
|
126
344
|
if outfilename not in outfilenames:
|
|
127
345
|
outfilenames.append(outfilename)
|
|
128
346
|
|
|
@@ -134,27 +352,75 @@ class MappingRules:
|
|
|
134
352
|
"""
|
|
135
353
|
Process rules for an infile, outfile combination
|
|
136
354
|
"""
|
|
137
|
-
outkey = ""
|
|
138
355
|
data = {}
|
|
356
|
+
### used for mapping simple fields that are always mapped (e.g., dob)
|
|
139
357
|
plain_key = ""
|
|
140
|
-
term_value_key = ""
|
|
358
|
+
term_value_key = "" ### used for mapping terms (e.g., gender, race, ethnicity)
|
|
141
359
|
|
|
142
360
|
## iterate through the rules, looking for rules that apply to the input file.
|
|
143
361
|
for outfield, source_info in rules.items():
|
|
144
|
-
if
|
|
145
|
-
data[source_info["source_field"]] = []
|
|
362
|
+
# Check if this rule applies to our input file
|
|
146
363
|
if source_info["source_table"] == infilename:
|
|
147
364
|
if "term_mapping" in source_info:
|
|
148
365
|
if type(source_info["term_mapping"]) is dict:
|
|
149
366
|
for inputvalue, term in source_info["term_mapping"].items():
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
367
|
+
if outfilename == "person":
|
|
368
|
+
term_value_key = infilename + "~person"
|
|
369
|
+
source_field = source_info["source_field"]
|
|
370
|
+
if source_field not in data:
|
|
371
|
+
data[source_field] = {}
|
|
372
|
+
if str(inputvalue) not in data[source_field]:
|
|
373
|
+
try:
|
|
374
|
+
data[source_field][str(inputvalue)] = []
|
|
375
|
+
except TypeError:
|
|
376
|
+
### need to convert data[source_field] to a dict
|
|
377
|
+
### like this: {'F': ['gender_concept_id~8532', 'gender_source_concept_id~8532', 'gender_source_value']}
|
|
378
|
+
temp_data_list = data[source_field].copy()
|
|
379
|
+
data[source_field] = {}
|
|
380
|
+
data[source_field][str(inputvalue)] = (
|
|
381
|
+
temp_data_list
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
data[source_field][str(inputvalue)].append(
|
|
385
|
+
outfield + "~" + str(term)
|
|
386
|
+
)
|
|
387
|
+
else:
|
|
388
|
+
term_value_key = (
|
|
389
|
+
infilename
|
|
390
|
+
+ "~"
|
|
391
|
+
+ source_info["source_field"]
|
|
392
|
+
+ "~"
|
|
393
|
+
+ str(inputvalue)
|
|
394
|
+
+ "~"
|
|
395
|
+
+ outfilename
|
|
396
|
+
)
|
|
397
|
+
if source_info["source_field"] not in data:
|
|
398
|
+
data[source_info["source_field"]] = []
|
|
399
|
+
data[source_info["source_field"]].append(
|
|
400
|
+
outfield + "~" + str(term)
|
|
401
|
+
)
|
|
153
402
|
else:
|
|
154
|
-
plain_key =
|
|
155
|
-
|
|
403
|
+
plain_key = (
|
|
404
|
+
infilename
|
|
405
|
+
+ "~"
|
|
406
|
+
+ source_info["source_field"]
|
|
407
|
+
+ "~"
|
|
408
|
+
+ outfilename
|
|
409
|
+
)
|
|
410
|
+
if source_info["source_field"] not in data:
|
|
411
|
+
data[source_info["source_field"]] = []
|
|
412
|
+
data[source_info["source_field"]].append(
|
|
413
|
+
outfield + "~" + str(source_info["term_mapping"])
|
|
414
|
+
)
|
|
156
415
|
else:
|
|
157
|
-
|
|
416
|
+
if source_info["source_field"] not in data:
|
|
417
|
+
data[source_info["source_field"]] = []
|
|
418
|
+
if type(data[source_info["source_field"]]) is dict:
|
|
419
|
+
data[source_info["source_field"]][str(inputvalue)].append(
|
|
420
|
+
outfield
|
|
421
|
+
)
|
|
422
|
+
else:
|
|
423
|
+
data[source_info["source_field"]].append(outfield)
|
|
158
424
|
if term_value_key != "":
|
|
159
425
|
return term_value_key, data
|
|
160
426
|
|