carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

Files changed (32) hide show
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,280 @@
1
+ {
2
+ "metadata": {
3
+ "date_created": "2025-07-22T10:41:33.297641+00:00",
4
+ "dataset": "transform"
5
+ },
6
+ "cdm": {
7
+ "observation": {
8
+ "Asian or Asian British 34532": {
9
+ "observation_datetime": {
10
+ "source_table": "Demographics.csv",
11
+ "source_field": "date_of_birth"
12
+ },
13
+ "observation_source_concept_id": {
14
+ "source_table": "Demographics.csv",
15
+ "source_field": "ethnicity",
16
+ "term_mapping": { "Asian": 35825508 }
17
+ },
18
+ "observation_concept_id": {
19
+ "source_table": "Demographics.csv",
20
+ "source_field": "ethnicity",
21
+ "term_mapping": { "Asian": 35825508 }
22
+ },
23
+ "observation_source_value": {
24
+ "source_table": "Demographics.csv",
25
+ "source_field": "ethnicity"
26
+ },
27
+ "person_id": {
28
+ "source_table": "Demographics.csv",
29
+ "source_field": "PersonID"
30
+ },
31
+ "value_as_string": {
32
+ "source_table": "Demographics.csv",
33
+ "source_field": "ethnicity"
34
+ }
35
+ },
36
+ "Indian 34534": {
37
+ "person_id": {
38
+ "source_table": "Demographics.csv",
39
+ "source_field": "PersonID"
40
+ },
41
+ "observation_datetime": {
42
+ "source_table": "Demographics.csv",
43
+ "source_field": "date_of_birth"
44
+ },
45
+ "observation_source_concept_id": {
46
+ "source_table": "Demographics.csv",
47
+ "source_field": "ethnicity",
48
+ "term_mapping": { "Indian": 35826241 }
49
+ },
50
+ "observation_concept_id": {
51
+ "source_table": "Demographics.csv",
52
+ "source_field": "ethnicity",
53
+ "term_mapping": { "Indian": 35826241 }
54
+ },
55
+ "observation_source_value": {
56
+ "source_table": "Demographics.csv",
57
+ "source_field": "ethnicity"
58
+ },
59
+ "value_as_string": {
60
+ "source_table": "Demographics.csv",
61
+ "source_field": "ethnicity"
62
+ }
63
+ },
64
+ "White and Asian 34537": {
65
+ "person_id": {
66
+ "source_table": "Demographics.csv",
67
+ "source_field": "PersonID"
68
+ },
69
+ "observation_datetime": {
70
+ "source_table": "Demographics.csv",
71
+ "source_field": "date_of_birth"
72
+ },
73
+ "observation_source_concept_id": {
74
+ "source_table": "Demographics.csv",
75
+ "source_field": "ethnicity",
76
+ "term_mapping": { "White and Asian": 35827395 }
77
+ },
78
+ "observation_concept_id": {
79
+ "source_table": "Demographics.csv",
80
+ "source_field": "ethnicity",
81
+ "term_mapping": { "White and Asian": 35827395 }
82
+ },
83
+ "observation_source_value": {
84
+ "source_table": "Demographics.csv",
85
+ "source_field": "ethnicity"
86
+ },
87
+ "value_as_string": {
88
+ "source_table": "Demographics.csv",
89
+ "source_field": "ethnicity"
90
+ }
91
+ },
92
+ "Bangladesh 34542": {
93
+ "value_as_string": {
94
+ "source_table": "Demographics.csv",
95
+ "source_field": "ethnicity"
96
+ },
97
+ "observation_source_value": {
98
+ "source_table": "Demographics.csv",
99
+ "source_field": "ethnicity"
100
+ },
101
+ "observation_concept_id": {
102
+ "source_table": "Demographics.csv",
103
+ "source_field": "ethnicity",
104
+ "term_mapping": { "White and Asian": 35825531 }
105
+ },
106
+ "observation_source_concept_id": {
107
+ "source_table": "Demographics.csv",
108
+ "source_field": "ethnicity",
109
+ "term_mapping": { "White and Asian": 35825531 }
110
+ },
111
+ "observation_datetime": {
112
+ "source_table": "Demographics.csv",
113
+ "source_field": "date_of_birth"
114
+ },
115
+ "person_id": {
116
+ "source_table": "Demographics.csv",
117
+ "source_field": "PersonID"
118
+ }
119
+ }
120
+ },
121
+ "condition_occurrence": {
122
+ "Cough 34538": {
123
+ "person_id": {
124
+ "source_table": "Symptoms.csv",
125
+ "source_field": "PersonID"
126
+ },
127
+ "condition_start_datetime": {
128
+ "source_table": "Symptoms.csv",
129
+ "source_field": "visit_date"
130
+ },
131
+ "condition_end_datetime": {
132
+ "source_table": "Symptoms.csv",
133
+ "source_field": "visit_date"
134
+ },
135
+ "condition_source_concept_id": {
136
+ "source_table": "Symptoms.csv",
137
+ "source_field": "symptom1",
138
+ "term_mapping": { "Y": 254761 }
139
+ },
140
+ "condition_concept_id": {
141
+ "source_table": "Symptoms.csv",
142
+ "source_field": "symptom1",
143
+ "term_mapping": { "Y": 254761 }
144
+ },
145
+ "condition_source_value": {
146
+ "source_table": "Symptoms.csv",
147
+ "source_field": "symptom1"
148
+ }
149
+ }
150
+ },
151
+ "person": {
152
+ "FEMALE 34539": {
153
+ "gender_source_value": {
154
+ "source_table": "Demographics.csv",
155
+ "source_field": "sex"
156
+ },
157
+ "person_id": {
158
+ "source_table": "Demographics.csv",
159
+ "source_field": "PersonID"
160
+ },
161
+ "birth_datetime": {
162
+ "source_table": "Demographics.csv",
163
+ "source_field": "date_of_birth"
164
+ },
165
+ "gender_source_concept_id": {
166
+ "source_table": "Demographics.csv",
167
+ "source_field": "sex",
168
+ "term_mapping": { "F": 8532 }
169
+ },
170
+ "gender_concept_id": {
171
+ "source_table": "Demographics.csv",
172
+ "source_field": "sex",
173
+ "term_mapping": { "F": 8532 }
174
+ }
175
+ },
176
+ "MALE 34540": {
177
+ "gender_source_value": {
178
+ "source_table": "Demographics.csv",
179
+ "source_field": "sex"
180
+ },
181
+ "gender_concept_id": {
182
+ "source_table": "Demographics.csv",
183
+ "source_field": "sex",
184
+ "term_mapping": { "M": 8507 }
185
+ },
186
+ "gender_source_concept_id": {
187
+ "source_table": "Demographics.csv",
188
+ "source_field": "sex",
189
+ "term_mapping": { "M": 8507 }
190
+ },
191
+ "birth_datetime": {
192
+ "source_table": "Demographics.csv",
193
+ "source_field": "date_of_birth"
194
+ },
195
+ "person_id": {
196
+ "source_table": "Demographics.csv",
197
+ "source_field": "PersonID"
198
+ }
199
+ },
200
+ "White 212273": {
201
+ "person_id": {
202
+ "source_table": "Demographics.csv",
203
+ "source_field": "PersonID"
204
+ },
205
+ "birth_datetime": {
206
+ "source_table": "Demographics.csv",
207
+ "source_field": "date_of_birth"
208
+ },
209
+ "race_source_concept_id": {
210
+ "source_table": "Demographics.csv",
211
+ "source_field": "ethnicity",
212
+ "term_mapping": { "White": 8527 }
213
+ },
214
+ "race_concept_id": {
215
+ "source_table": "Demographics.csv",
216
+ "source_field": "ethnicity",
217
+ "term_mapping": { "White": 8527 }
218
+ },
219
+ "race_source_value": {
220
+ "source_table": "Demographics.csv",
221
+ "source_field": "ethnicity"
222
+ }
223
+ },
224
+ "African 212274": {
225
+ "person_id": {
226
+ "source_table": "Demographics.csv",
227
+ "source_field": "PersonID"
228
+ },
229
+ "birth_datetime": {
230
+ "source_table": "Demographics.csv",
231
+ "source_field": "date_of_birth"
232
+ },
233
+ "race_source_concept_id": {
234
+ "source_table": "Demographics.csv",
235
+ "source_field": "ethnicity",
236
+ "term_mapping": { "Black": 38003600 }
237
+ },
238
+ "race_concept_id": {
239
+ "source_table": "Demographics.csv",
240
+ "source_field": "ethnicity",
241
+ "term_mapping": { "Black": 38003600 }
242
+ },
243
+ "race_source_value": {
244
+ "source_table": "Demographics.csv",
245
+ "source_field": "ethnicity"
246
+ }
247
+ }
248
+ },
249
+ "measurement": {
250
+ "IgG 34541": {
251
+ "measurement_source_value": {
252
+ "source_table": "covid19_antibody.csv",
253
+ "source_field": "IgG"
254
+ },
255
+ "person_id": {
256
+ "source_table": "covid19_antibody.csv",
257
+ "source_field": "PersonID"
258
+ },
259
+ "measurement_datetime": {
260
+ "source_table": "covid19_antibody.csv",
261
+ "source_field": "date"
262
+ },
263
+ "value_as_number": {
264
+ "source_table": "covid19_antibody.csv",
265
+ "source_field": "IgG"
266
+ },
267
+ "measurement_source_concept_id": {
268
+ "source_table": "covid19_antibody.csv",
269
+ "source_field": "IgG",
270
+ "term_mapping": 37398191
271
+ },
272
+ "measurement_concept_id": {
273
+ "source_table": "covid19_antibody.csv",
274
+ "source_field": "IgG",
275
+ "term_mapping": 37398191
276
+ }
277
+ }
278
+ }
279
+ }
280
+ }
@@ -0,0 +1,115 @@
1
+ {
2
+ "metadata": {
3
+ "date_created": "2025-07-22T11:13:38.203888+00:00",
4
+ "dataset": "transform"
5
+ },
6
+ "cdm": {
7
+ "condition_occurrence": {
8
+ "Symptoms.csv": {
9
+ "person_id_mapping": {
10
+ "source_field": "PersonID",
11
+ "dest_field": "person_id"
12
+ },
13
+ "date_mapping": {
14
+ "source_field": "visit_date",
15
+ "dest_field": ["condition_start_datetime", "condition_end_datetime"]
16
+ },
17
+ "concept_mappings": {
18
+ "symptom1": {
19
+ "Y": {
20
+ "condition_source_concept_id": [254761],
21
+ "condition_concept_id": [254761]
22
+ },
23
+ "original_value": ["condition_source_value"]
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "measurement": {
29
+ "covid19_antibody.csv": {
30
+ "person_id_mapping": {
31
+ "source_field": "PersonID",
32
+ "dest_field": "person_id"
33
+ },
34
+ "date_mapping": {
35
+ "source_field": "date",
36
+ "dest_field": ["measurement_datetime"]
37
+ },
38
+ "concept_mappings": {
39
+ "IgG": {
40
+ "*": {
41
+ "measurement_source_concept_id": [37398191],
42
+ "measurement_concept_id": [37398191]
43
+ },
44
+ "original_value": ["measurement_source_value", "value_as_number"]
45
+ }
46
+ }
47
+ }
48
+ },
49
+ "observation": {
50
+ "Demographics.csv": {
51
+ "person_id_mapping": {
52
+ "source_field": "PersonID",
53
+ "dest_field": "person_id"
54
+ },
55
+ "date_mapping": {
56
+ "source_field": "date_of_birth",
57
+ "dest_field": ["observation_datetime"]
58
+ },
59
+ "concept_mappings": {
60
+ "ethnicity": {
61
+ "White and Asian": {
62
+ "observation_concept_id": [35825531, 35827395],
63
+ "observation_source_concept_id": [35825531, 35827395]
64
+ },
65
+ "Indian": {
66
+ "observation_concept_id": [35826241],
67
+ "observation_source_concept_id": [35826241]
68
+ },
69
+ "Asian": {
70
+ "observation_concept_id": [35825508],
71
+ "observation_source_concept_id": [35825508]
72
+ },
73
+ "original_value": ["value_as_string", "observation_source_value"]
74
+ }
75
+ }
76
+ }
77
+ },
78
+ "person": {
79
+ "Demographics.csv": {
80
+ "person_id_mapping": {
81
+ "source_field": "PersonID",
82
+ "dest_field": "person_id"
83
+ },
84
+ "date_mapping": {
85
+ "source_field": "date_of_birth",
86
+ "dest_field": ["birth_datetime"]
87
+ },
88
+ "concept_mappings": {
89
+ "ethnicity": {
90
+ "Black": {
91
+ "race_source_concept_id": [38003600],
92
+ "race_concept_id": [38003600]
93
+ },
94
+ "White": {
95
+ "race_concept_id": [8527],
96
+ "race_source_concept_id": [8527]
97
+ },
98
+ "original_value": ["race_source_value"]
99
+ },
100
+ "sex": {
101
+ "M": {
102
+ "gender_concept_id": [8507],
103
+ "gender_source_concept_id": [8507]
104
+ },
105
+ "F": {
106
+ "gender_concept_id": [8532],
107
+ "gender_source_concept_id": [8532]
108
+ },
109
+ "original_value": ["gender_source_value"]
110
+ }
111
+ }
112
+ }
113
+ }
114
+ }
115
+ }
@@ -1,17 +1,7 @@
1
- import carrottransform
2
- import os
3
- import sys
4
- import json
5
- import time
1
+ from .file_helpers import load_json as load_json
6
2
 
7
- from .file_helpers import (
8
- load_json
9
- )
3
+ from .metrics import Metrics as Metrics
10
4
 
11
- from .metrics import (
12
- Metrics
13
- )
14
- from . import mappingrules
15
-
16
- from . import omopcdm
5
+ from . import mappingrules as mappingrules
17
6
 
7
+ from . import omopcdm as omopcdm
@@ -0,0 +1,128 @@
1
+ """
2
+ functions to handle args
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+
8
class OnlyOnePersonInputAllowed(Exception):
    """Raised when the person rules read from more than one source table.

    The offending values are kept on the instance:
    ``_rules_file`` and ``_person_file`` are the paths that were checked and
    ``_inputs`` is the set of source-table names found in the person rules.
    """

    def __init__(self, rules_file: Path, person_file: Path, inputs: set[str]):
        # Forward the constructor arguments unchanged so ``args`` keeps
        # mirroring the signature (this is what keeps copy/pickle working).
        super().__init__(rules_file, person_file, inputs)
        self._rules_file = rules_file
        self._person_file = person_file
        self._inputs = inputs

    def __str__(self) -> str:
        # Sort the table names so the message is stable between runs.
        tables = ", ".join(sorted(self._inputs))
        return (
            f"person rules in {self._rules_file} read from more than one "
            f"source table ({tables}); only the person file "
            f"{self._person_file} is allowed"
        )
15
+
16
+
17
class NoPersonMappings(Exception):
    """Raised when the rules file contains no person mappings.

    ``_rules_file`` and ``_person_file`` hold the paths that were checked.
    """

    def __init__(self, rules_file: Path, person_file: Path):
        # Forward the constructor arguments unchanged so ``args`` keeps
        # mirroring the signature (this is what keeps copy/pickle working).
        super().__init__(rules_file, person_file)
        self._rules_file = rules_file
        self._person_file = person_file

    def __str__(self) -> str:
        return (
            f"no person mappings found in rules file {self._rules_file} "
            f"(person file: {self._person_file})"
        )
23
+
24
+
25
class WrongInputException(Exception):
    """Raised when the person rules read from the wrong table - and only the wrong table.

    ``_rules_file`` and ``_person_file`` hold the paths that were checked and
    ``_source_table`` is the table name the person rules actually referenced.
    """

    def __init__(self, rules_file: Path, person_file: Path, source_table: str):
        # Forward the constructor arguments unchanged so ``args`` keeps
        # mirroring the signature (this is what keeps copy/pickle working).
        super().__init__(rules_file, person_file, source_table)
        self._rules_file = rules_file
        self._person_file = person_file
        self._source_table = source_table

    def __str__(self) -> str:
        return (
            f"person rules in {self._rules_file} read from "
            f"{self._source_table!r}, which is not the person file "
            f"{self._person_file}"
        )
32
+
33
+
34
class ObjectQueryError(Exception):
    """Raised when the object path format is invalid.

    ``object_query`` raises this for a path that starts or ends with ``/``
    or whose leading key is blank.
    """
36
+
37
+
38
class ObjectStructureError(Exception):
    """Raised when the object path points to inaccessible elements.

    ``object_query`` raises this when a key is missing from the object, or
    when the path continues below a value that is not a dict.
    """
40
+
41
+
42
def person_rules_check(person_file: Path, rules_file: Path) -> None:
    """Check that the person rules in the rules file are correct.

    We need all person/patient records to come from one file - the person
    file. This includes the gender mapping. That file should/must also be the
    ``person_file`` parameter.

    ... this does reopen the possibility of auto-detecting the person file
    from the rules file.

    Args:
        person_file: Path of the person input file. Only its file name is
            compared against the ``source_table`` values in the rules.
        rules_file: Path of the JSON mapping-rules file.

    Raises:
        FileNotFoundError: If either path is not an existing file.
        NoPersonMappings: If the rules have no ``cdm/person`` entry, or that
            entry does not reference any source table.
        OnlyOnePersonInputAllowed: If the person rules reference more than
            one source table.
        WrongInputException: If the single referenced source table is not
            the person file.
        ObjectStructureError: If the rules cannot be navigated for any other
            reason (for example a missing ``cdm`` key).
    """
    # Kept as a function-local import, as in the original implementation.
    import json

    # check the args are real files
    if not person_file.is_file():
        raise FileNotFoundError(f"person file not found: {person_file=}")
    if not rules_file.is_file():
        raise FileNotFoundError(f"rules file not found: {rules_file=}")

    # load the rules file (JSON interchange text is UTF-8)
    with open(rules_file, encoding="utf-8") as file:
        rules_json = json.load(file)

    # find the person rules; a missing "person" key means there are none
    try:
        person_rules = object_query(rules_json, "cdm/person")
    except ObjectStructureError as e:
        if str(e) == "Key 'person' not found in object":
            raise NoPersonMappings(rules_file, person_file) from e
        raise

    # collect every source table referenced by any column of any person rule
    seen_inputs: set[str] = {
        column["source_table"]
        for rule in person_rules.values()
        for column in rule.values()
    }

    # for theoretical cases when there is a `"person": {}` entry that's empty
    # (or rules without columns)
    # ... i don't think that carrot-mapper would emit it, but it is valid JSON
    if not seen_inputs:
        raise NoPersonMappings(rules_file, person_file)

    # detect too many input files
    if len(seen_inputs) > 1:
        raise OnlyOnePersonInputAllowed(rules_file, person_file, seen_inputs)

    # check if the seen file is correct
    (seen_table,) = seen_inputs
    if person_file.name != seen_table:
        raise WrongInputException(rules_file, person_file, seen_table)
90
+
91
+
92
+ def object_query(data: dict[str, dict | str], path: str):
93
+ """
94
+ Navigate a nested dictionary using a `/`-delimited path string.
95
+
96
+ Args:
97
+ data: The dictionary to traverse.
98
+ path: The object path, e.g., "/foo/bar".
99
+
100
+ Returns:
101
+ The value at the given path.
102
+
103
+ Raises:
104
+ ObjectQueryError: If the path format is invalid or the key is missing.
105
+ """
106
+
107
+ if path.startswith("/") or path.endswith("/"):
108
+ raise ObjectQueryError(
109
+ f"Invalid path format: {path!r} (must not start with '/' and not end with '/')"
110
+ )
111
+
112
+ current_key, _, remaining_path = path.partition("/")
113
+ if not current_key:
114
+ raise ObjectQueryError(f"Invalid path: blank key at start in {path!r}")
115
+
116
+ if current_key not in data:
117
+ raise ObjectStructureError(f"Key {current_key!r} not found in object")
118
+
119
+ value = data[current_key]
120
+ if not remaining_path:
121
+ return value
122
+
123
+ if not isinstance(value, dict):
124
+ raise ObjectStructureError(
125
+ f"Cannot descend into non-dict value at key {current_key!r}"
126
+ )
127
+
128
+ return object_query(value, remaining_path)
@@ -0,0 +1,61 @@
1
+ from typing import Dict, List, Optional
2
+ from carrottransform.tools.mapping_types import ConceptMapping
3
+
4
+
5
def generate_combinations(
    value_mapping: Optional[Dict[str, List[int]]],
) -> List[Dict[str, int]]:
    """
    Generate all concept combinations for multiple concept IDs

    NOTE: this logic can handle an uneven number of concept IDs across
    fields (a shorter list repeats its last concept ID), even though this
    scenario needs more investigation.
    For now, the len of dest_fields should be equal

    For example, if value_mapping is:
    {
        "observation_concept_id": [35827395, 35825531],
        "observation_source_concept_id": [35827395, 35825531]
    }

    This returns:
    [
        {"observation_concept_id": 35827395, "observation_source_concept_id": 35827395},
        {"observation_concept_id": 35825531, "observation_source_concept_id": 35825531}
    ]

    Args:
        value_mapping: Destination field name -> list of concept IDs.
            Fields whose list is empty (or None) are left out of the result.

    Returns:
        One dict per combination index, in order. An empty list when
        value_mapping is empty/None or no field has any concept ID.
    """
    if not value_mapping:
        return []

    # Only fields that actually carry concept IDs take part.
    populated = {
        dest_field: concept_ids
        for dest_field, concept_ids in value_mapping.items()
        if concept_ids
    }

    # default=0 covers the case where every field's list is empty; without
    # it max() raises ValueError on the empty sequence.
    max_concepts = max(
        (len(concept_ids) for concept_ids in populated.values()), default=0
    )

    return [
        {
            # Use the concept at index i, or the last one if not enough concepts
            dest_field: concept_ids[min(i, len(concept_ids) - 1)]
            for dest_field, concept_ids in populated.items()
        }
        for i in range(max_concepts)
    ]
44
+
45
+
46
def get_value_mapping(
    concept_mapping: ConceptMapping, source_value: str
) -> Optional[Dict[str, List[int]]]:
    """
    Look up the value mapping for a source value, falling back to the wildcard

    Lookup order:
    1. The source value itself (exact match)
    2. The wildcard key "*" - maps all values to the same concept
    3. None when neither key is present
    """
    # Try the keys in priority order and return the first one that exists.
    for candidate in (source_value, "*"):
        if candidate in concept_mapping.value_mappings:
            return concept_mapping.value_mappings[candidate]
    return None