carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,29 +1,122 @@
1
- import os
2
1
  import json
2
+ from pathlib import Path
3
3
  import carrottransform.tools as tools
4
- from .omopcdm import OmopCDM
4
+ from typing import Dict, Any, List, Optional
5
+ from carrottransform.tools.mapping_types import (
6
+ PersonIdMapping,
7
+ DateMapping,
8
+ ConceptMapping,
9
+ V2TableMapping,
10
+ )
11
+ from carrottransform.tools.logger import logger_setup
12
+ from carrottransform.tools.omopcdm import OmopCDM
13
+
14
+ logger = logger_setup()
5
15
 
6
- import logging
7
- logger = logging.getLogger(__name__)
8
16
 
9
17
  class MappingRules:
10
18
  """
11
- self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
19
+ self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
12
20
  as a file-specific dictionary allowing rules to be "looked-up" depending on data content
13
21
  """
14
22
 
15
- def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
23
+ def __init__(self, rulesfilepath: Path, omopcdm: OmopCDM):
16
24
  ## just loads the json directly
17
- self.rules_data = tools.load_json(rulesfilepath)
25
+ self.rules_data = tools.load_json(Path(rulesfilepath))
18
26
  self.omopcdm = omopcdm
19
-
20
- self.parsed_rules = {}
21
- self.outfile_names = {}
27
+
28
+ # Detect format version and parse accordingly
29
+ self.is_v2_format = self._is_v2_format()
30
+ if self.is_v2_format:
31
+ logger.info("Detected v2.json format, using direct v2 parser...")
32
+ self.v2_mappings = self._parse_v2_format()
33
+ else:
34
+ logger.info("Detected v1.json format, using legacy parser...")
35
+
36
+ self.parsed_rules: Dict[str, Dict[str, Any]] = {}
37
+ self.outfile_names: Dict[str, List[str]] = {}
22
38
 
23
39
  self.dataset_name = self.get_dsname_from_rules()
24
40
 
41
+ def _is_v2_format(self) -> bool:
42
+ """
43
+ Detect if the rules file is in v2 format by checking for characteristic v2 structures
44
+ """
45
+ # Check if any table has the v2 structure (source_table -> mapping_types)
46
+ for table_name, table_data in self.rules_data["cdm"].items():
47
+ if isinstance(table_data, dict):
48
+ for key, value in table_data.items():
49
+ # v2 format has CSV filenames as keys, with mapping types as values
50
+ if isinstance(value, dict) and all(
51
+ mapping_type in value
52
+ for mapping_type in [
53
+ "person_id_mapping",
54
+ "date_mapping",
55
+ "concept_mappings",
56
+ ]
57
+ ):
58
+ return True
59
+ return False
60
+
61
+ def _parse_v2_format(self) -> Dict[str, Dict[str, V2TableMapping]]:
62
+ """
63
+ Parse v2 format into clean data structures
64
+ Returns: Dict[table_name, Dict[source_table, V2TableMapping]]
65
+ """
66
+ v2_mappings: Dict[str, Dict[str, V2TableMapping]] = {}
67
+
68
+ for table_name, table_data in self.rules_data["cdm"].items():
69
+ v2_mappings[table_name] = {}
70
+
71
+ for source_table, mappings in table_data.items():
72
+ # Parse person_id_mapping
73
+ person_id_mapping = None
74
+ if "person_id_mapping" in mappings:
75
+ pid_data = mappings["person_id_mapping"]
76
+ person_id_mapping = PersonIdMapping(
77
+ source_field=pid_data["source_field"],
78
+ dest_field=pid_data["dest_field"],
79
+ )
80
+
81
+ # Parse date_mapping
82
+ date_mapping = None
83
+ if "date_mapping" in mappings:
84
+ date_data = mappings["date_mapping"]
85
+ date_mapping = DateMapping(
86
+ source_field=date_data["source_field"],
87
+ dest_fields=date_data["dest_field"],
88
+ )
89
+
90
+ # Parse concept_mappings
91
+ concept_mappings = {}
92
+ if "concept_mappings" in mappings:
93
+ for source_field, field_mappings in mappings[
94
+ "concept_mappings"
95
+ ].items():
96
+ original_value_fields = field_mappings.get("original_value", [])
97
+ value_mappings = {}
98
+
99
+ for source_value, dest_mappings in field_mappings.items():
100
+ if source_value != "original_value":
101
+ value_mappings[source_value] = dest_mappings
102
+
103
+ concept_mappings[source_field] = ConceptMapping(
104
+ source_field=source_field,
105
+ value_mappings=value_mappings,
106
+ original_value_fields=original_value_fields,
107
+ )
108
+
109
+ v2_mappings[table_name][source_table] = V2TableMapping(
110
+ source_table=source_table,
111
+ person_id_mapping=person_id_mapping,
112
+ date_mapping=date_mapping,
113
+ concept_mappings=concept_mappings,
114
+ )
115
+
116
+ return v2_mappings
117
+
25
118
  def dump_parsed_rules(self):
26
- return(json.dumps(self.parsed_rules, indent=2))
119
+ return json.dumps(self.parsed_rules, indent=2)
27
120
 
28
121
  def get_dsname_from_rules(self):
29
122
  dsname = "Unknown"
@@ -38,23 +131,62 @@ class MappingRules:
38
131
  return self.dataset_name
39
132
 
40
133
  def get_all_outfile_names(self):
41
- return list(self.rules_data["cdm"])
134
+ if self.is_v2_format:
135
+ return list(self.v2_mappings.keys())
136
+ else:
137
+ return list(self.rules_data["cdm"])
42
138
 
43
139
  def get_all_infile_names(self):
140
+ if self.is_v2_format:
141
+ return self._get_all_infile_names_v2()
142
+ else:
143
+ return self._get_all_infile_names_v1()
144
+
145
+ def _get_all_infile_names_v2(self) -> List[str]:
146
+ """Get all input file names from v2 format"""
44
147
  file_list = []
148
+ for table_mappings in self.v2_mappings.values():
149
+ for source_table in table_mappings.keys():
150
+ if source_table not in file_list:
151
+ file_list.append(source_table)
152
+ return file_list
45
153
 
154
+ def _get_all_infile_names_v1(self) -> List[str]:
155
+ """Get all input file names from v1 format (legacy method)"""
156
+ file_list = []
46
157
  for outfilename, conditions in self.rules_data["cdm"].items():
47
158
  for outfield, source_field in conditions.items():
48
159
  for source_field_name, source_data in source_field.items():
49
160
  if "source_table" in source_data:
50
161
  if source_data["source_table"] not in file_list:
51
162
  file_list.append(source_data["source_table"])
52
-
53
163
  return file_list
54
-
55
- def get_infile_data_fields(self, infilename):
56
- data_fields_lists = {}
57
164
 
165
+ def get_infile_data_fields(self, infilename: str):
166
+ if self.is_v2_format:
167
+ return self._get_infile_data_fields_v2(infilename)
168
+ else:
169
+ return self._get_infile_data_fields_v1(infilename)
170
+
171
+ def _get_infile_data_fields_v2(self, infilename: str) -> Dict[str, List[str]]:
172
+ """Get data fields for a specific input file from v2 format"""
173
+ data_fields_lists: Dict[str, List[str]] = {}
174
+
175
+ for table_name, table_mappings in self.v2_mappings.items():
176
+ if infilename in table_mappings:
177
+ mapping = table_mappings[infilename]
178
+ data_fields_lists[table_name] = []
179
+
180
+ # Add fields from concept mappings
181
+ for source_field in mapping.concept_mappings.keys():
182
+ if source_field not in data_fields_lists[table_name]:
183
+ data_fields_lists[table_name].append(source_field)
184
+
185
+ return data_fields_lists
186
+
187
+ def _get_infile_data_fields_v1(self, infilename: str) -> Dict[str, List[str]]:
188
+ """Get data fields for a specific input file from v1 format (legacy method)"""
189
+ data_fields_lists: Dict[str, List[str]] = {}
58
190
  outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
59
191
 
60
192
  for outfilename in outfilenames:
@@ -73,7 +205,36 @@ class MappingRules:
73
205
 
74
206
  return data_fields_lists
75
207
 
76
- def get_infile_date_person_id(self, infilename):
208
+ def get_infile_date_person_id(self, infilename: str):
209
+ if self.is_v2_format:
210
+ return self._get_infile_date_person_id_v2(infilename)
211
+ else:
212
+ return self._get_infile_date_person_id_v1(infilename)
213
+
214
+ # TODO: combine this with _get_person_source_field_info_v2
215
+ def _get_infile_date_person_id_v2(self, infilename: str) -> tuple[str, str]:
216
+ """Get datetime and person_id source fields for v2 format"""
217
+ datetime_source = ""
218
+ person_id_source = ""
219
+
220
+ for table_mappings in self.v2_mappings.values():
221
+ if infilename in table_mappings:
222
+ mapping = table_mappings[infilename]
223
+
224
+ if mapping.date_mapping:
225
+ datetime_source = mapping.date_mapping.source_field
226
+
227
+ if mapping.person_id_mapping:
228
+ person_id_source = mapping.person_id_mapping.source_field
229
+
230
+ # If we found both, we can break
231
+ if datetime_source and person_id_source:
232
+ break
233
+
234
+ return datetime_source, person_id_source
235
+
236
+ def _get_infile_date_person_id_v1(self, infilename: str) -> tuple[str, str]:
237
+ """Get datetime and person_id source fields for v1 format (legacy method)"""
77
238
  outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
78
239
  datetime_source = ""
79
240
  person_id_source = ""
@@ -83,27 +244,65 @@ class MappingRules:
83
244
  outfile = keydata[-1]
84
245
  for outfield_elem in outfield_data:
85
246
  for infield, outfield_list in outfield_elem.items():
86
- logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
247
+ logger.debug(
248
+ "{0}, {1}, {2}".format(outfile, infield, str(outfield_list))
249
+ )
87
250
  for outfield in outfield_list:
88
- if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
251
+ if outfield.split("~")[
252
+ 0
253
+ ] in self.omopcdm.get_omop_datetime_fields(outfile):
89
254
  datetime_source = infield
90
- if outfield.split('~')[0] == self.omopcdm.get_omop_person_id_field(outfile):
255
+ if outfield.split("~")[
256
+ 0
257
+ ] == self.omopcdm.get_omop_person_id_field(outfile):
91
258
  person_id_source = infield
92
259
 
93
260
  return datetime_source, person_id_source
94
261
 
95
- def get_person_source_field_info(self, tgtfilename):
262
+ def get_person_source_field_info(self, tgtfilename: str):
263
+ if self.is_v2_format:
264
+ return self._get_person_source_field_info_v2(tgtfilename)
265
+ else:
266
+ return self._get_person_source_field_info_v1(tgtfilename)
267
+
268
+ def _get_person_source_field_info_v2(
269
+ self, tgtfilename: str
270
+ ) -> tuple[Optional[str], Optional[str]]:
96
271
  """
97
- Specific discovery of input data field names for 'person' in these rules
272
+ Get person source field info for v2 format,
273
+ from the dest. table "Person" in the rules file.
98
274
  """
99
275
  birth_datetime_source = None
100
276
  person_id_source = None
277
+
278
+ if tgtfilename in self.v2_mappings:
279
+ for mapping in self.v2_mappings[tgtfilename].values():
280
+ if mapping.date_mapping:
281
+ birth_datetime_source = mapping.date_mapping.source_field
282
+
283
+ if mapping.person_id_mapping:
284
+ person_id_source = mapping.person_id_mapping.source_field
285
+
286
+ # If we found both, we can break
287
+ if birth_datetime_source and person_id_source:
288
+ break
289
+
290
+ return birth_datetime_source, person_id_source
291
+
292
+ def _get_person_source_field_info_v1(
293
+ self, tgtfilename: str
294
+ ) -> tuple[Optional[str], Optional[str]]:
295
+ """Get person source field info for v1 format (legacy method)"""
296
+ birth_datetime_source = None
297
+ person_id_source = None
101
298
  if tgtfilename in self.rules_data["cdm"]:
102
299
  source_rules_data = self.rules_data["cdm"][tgtfilename]
103
300
  ## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unneccesary looping
104
301
  for rule_name, rule_fields in source_rules_data.items():
105
302
  if "birth_datetime" in rule_fields:
106
- birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
303
+ birth_datetime_source = rule_fields["birth_datetime"][
304
+ "source_field"
305
+ ]
107
306
  if "person_id" in rule_fields:
108
307
  person_id_source = rule_fields["person_id"]["source_field"]
109
308
 
@@ -125,7 +324,23 @@ class MappingRules:
125
324
  if key != "":
126
325
  if key not in outdata:
127
326
  outdata[key] = []
128
- outdata[key].append(data)
327
+ if key.split("~")[-1] == "person":
328
+ outdata[key].append(data)
329
+
330
+ if key.split("~")[-1] == "person":
331
+ # Find matching source field keys and merge their dictionaries
332
+ for source_field, value in data.items():
333
+ if source_field in outdata[key][0] and isinstance(
334
+ outdata[key][0][source_field], dict
335
+ ):
336
+ # Merge the dictionaries for this source field
337
+ outdata[key][0][source_field].update(value)
338
+ else:
339
+ # If no matching dict or new source field, just set it
340
+ outdata[key][0][source_field] = value
341
+ pass
342
+ else:
343
+ outdata[key].append(data)
129
344
  if outfilename not in outfilenames:
130
345
  outfilenames.append(outfilename)
131
346
 
@@ -137,27 +352,75 @@ class MappingRules:
137
352
  """
138
353
  Process rules for an infile, outfile combination
139
354
  """
140
- outkey = ""
141
355
  data = {}
356
+ ### used for mapping simple fields that are always mapped (e.g., dob)
142
357
  plain_key = ""
143
- term_value_key = ""
358
+ term_value_key = "" ### used for mapping terms (e.g., gender, race, ethnicity)
144
359
 
145
360
  ## iterate through the rules, looking for rules that apply to the input file.
146
361
  for outfield, source_info in rules.items():
147
- if source_info["source_field"] not in data:
148
- data[source_info["source_field"]] = []
362
+ # Check if this rule applies to our input file
149
363
  if source_info["source_table"] == infilename:
150
364
  if "term_mapping" in source_info:
151
365
  if type(source_info["term_mapping"]) is dict:
152
366
  for inputvalue, term in source_info["term_mapping"].items():
153
- ## add a key/add to the list of data in the dict for the given input file
154
- term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
155
- data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"][str(inputvalue)]))
367
+ if outfilename == "person":
368
+ term_value_key = infilename + "~person"
369
+ source_field = source_info["source_field"]
370
+ if source_field not in data:
371
+ data[source_field] = {}
372
+ if str(inputvalue) not in data[source_field]:
373
+ try:
374
+ data[source_field][str(inputvalue)] = []
375
+ except TypeError:
376
+ ### need to convert data[source_field] to a dict
377
+ ### like this: {'F': ['gender_concept_id~8532', 'gender_source_concept_id~8532', 'gender_source_value']}
378
+ temp_data_list = data[source_field].copy()
379
+ data[source_field] = {}
380
+ data[source_field][str(inputvalue)] = (
381
+ temp_data_list
382
+ )
383
+
384
+ data[source_field][str(inputvalue)].append(
385
+ outfield + "~" + str(term)
386
+ )
387
+ else:
388
+ term_value_key = (
389
+ infilename
390
+ + "~"
391
+ + source_info["source_field"]
392
+ + "~"
393
+ + str(inputvalue)
394
+ + "~"
395
+ + outfilename
396
+ )
397
+ if source_info["source_field"] not in data:
398
+ data[source_info["source_field"]] = []
399
+ data[source_info["source_field"]].append(
400
+ outfield + "~" + str(term)
401
+ )
156
402
  else:
157
- plain_key = infilename + "~" + source_info["source_field"] + "~" + outfilename
158
- data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"]))
403
+ plain_key = (
404
+ infilename
405
+ + "~"
406
+ + source_info["source_field"]
407
+ + "~"
408
+ + outfilename
409
+ )
410
+ if source_info["source_field"] not in data:
411
+ data[source_info["source_field"]] = []
412
+ data[source_info["source_field"]].append(
413
+ outfield + "~" + str(source_info["term_mapping"])
414
+ )
159
415
  else:
160
- data[source_info["source_field"]].append(outfield)
416
+ if source_info["source_field"] not in data:
417
+ data[source_info["source_field"]] = []
418
+ if type(data[source_info["source_field"]]) is dict:
419
+ data[source_info["source_field"]][str(inputvalue)].append(
420
+ outfield
421
+ )
422
+ else:
423
+ data[source_info["source_field"]].append(outfield)
161
424
  if term_value_key != "":
162
425
  return term_value_key, data
163
426