carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic. Click here for more details.
- carrot_transform-0.3.5.dist-info/METADATA +106 -0
- carrot_transform-0.3.5.dist-info/RECORD +25 -0
- {carrot_transform-0.3.3.dist-info → carrot_transform-0.3.5.dist-info}/WHEEL +1 -1
- carrot_transform-0.3.5.dist-info/entry_points.txt +3 -0
- carrottransform/_version.py +6 -2
- carrottransform/cli/subcommands/run.py +445 -193
- carrottransform/examples/test/inputs/Covid19_test.csv +801 -0
- carrottransform/examples/test/inputs/Demographics.csv +1001 -0
- carrottransform/examples/test/inputs/Symptoms.csv +801 -0
- carrottransform/examples/test/inputs/covid19_antibody.csv +1001 -0
- carrottransform/examples/test/inputs/vaccine.csv +501 -0
- carrottransform/examples/test/rules/rules_14June2021.json +300 -0
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/file_helpers.py +30 -4
- carrottransform/tools/mappingrules.py +13 -10
- carrottransform/tools/metrics.py +212 -40
- carrottransform/tools/omopcdm.py +17 -5
- carrot_transform-0.3.3.dist-info/METADATA +0 -48
- carrot_transform-0.3.3.dist-info/RECORD +0 -17
- {carrot_transform-0.3.3.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
{
|
|
2
|
+
"metadata": {
|
|
3
|
+
"date_created": "2021-06-14T15:27:37.123947",
|
|
4
|
+
"dataset": "Test"
|
|
5
|
+
},
|
|
6
|
+
"cdm": {
|
|
7
|
+
"observation": {
|
|
8
|
+
"observation_0": {
|
|
9
|
+
"observation_concept_id": {
|
|
10
|
+
"source_table": "Demographics.csv",
|
|
11
|
+
"source_field": "ethnicity",
|
|
12
|
+
"term_mapping": {
|
|
13
|
+
"Asian": 35825508
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"observation_datetime": {
|
|
17
|
+
"source_table": "Demographics.csv",
|
|
18
|
+
"source_field": "date_of_birth"
|
|
19
|
+
},
|
|
20
|
+
"observation_source_concept_id": {
|
|
21
|
+
"source_table": "Demographics.csv",
|
|
22
|
+
"source_field": "ethnicity",
|
|
23
|
+
"term_mapping": {
|
|
24
|
+
"Asian": 35825508
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"observation_source_value": {
|
|
28
|
+
"source_table": "Demographics.csv",
|
|
29
|
+
"source_field": "ethnicity"
|
|
30
|
+
},
|
|
31
|
+
"person_id": {
|
|
32
|
+
"source_table": "Demographics.csv",
|
|
33
|
+
"source_field": "PersonID"
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"observation_1":{
|
|
37
|
+
"observation_concept_id": {
|
|
38
|
+
"source_table": "Demographics.csv",
|
|
39
|
+
"source_field": "ethnicity",
|
|
40
|
+
"term_mapping": {
|
|
41
|
+
"Bangladeshi": 35825531
|
|
42
|
+
}
|
|
43
|
+
},
|
|
44
|
+
"observation_datetime": {
|
|
45
|
+
"source_table": "Demographics.csv",
|
|
46
|
+
"source_field": "date_of_birth"
|
|
47
|
+
},
|
|
48
|
+
"observation_source_concept_id": {
|
|
49
|
+
"source_table": "Demographics.csv",
|
|
50
|
+
"source_field": "ethnicity",
|
|
51
|
+
"term_mapping": {
|
|
52
|
+
"Bangladeshi": 35825531
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
"observation_source_value": {
|
|
56
|
+
"source_table": "Demographics.csv",
|
|
57
|
+
"source_field": "ethnicity"
|
|
58
|
+
},
|
|
59
|
+
"person_id": {
|
|
60
|
+
"source_table": "Demographics.csv",
|
|
61
|
+
"source_field": "PersonID"
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
"observation_2":{
|
|
65
|
+
"observation_concept_id": {
|
|
66
|
+
"source_table": "Demographics.csv",
|
|
67
|
+
"source_field": "ethnicity",
|
|
68
|
+
"term_mapping": {
|
|
69
|
+
"Indian": 35826241
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"observation_datetime": {
|
|
73
|
+
"source_table": "Demographics.csv",
|
|
74
|
+
"source_field": "date_of_birth"
|
|
75
|
+
},
|
|
76
|
+
"observation_source_concept_id": {
|
|
77
|
+
"source_table": "Demographics.csv",
|
|
78
|
+
"source_field": "ethnicity",
|
|
79
|
+
"term_mapping": {
|
|
80
|
+
"Indian": 35826241
|
|
81
|
+
}
|
|
82
|
+
},
|
|
83
|
+
"observation_source_value": {
|
|
84
|
+
"source_table": "Demographics.csv",
|
|
85
|
+
"source_field": "ethnicity"
|
|
86
|
+
},
|
|
87
|
+
"person_id": {
|
|
88
|
+
"source_table": "Demographics.csv",
|
|
89
|
+
"source_field": "PersonID"
|
|
90
|
+
}
|
|
91
|
+
},
|
|
92
|
+
"observation_3":{
|
|
93
|
+
"observation_concept_id": {
|
|
94
|
+
"source_table": "Demographics.csv",
|
|
95
|
+
"source_field": "ethnicity",
|
|
96
|
+
"term_mapping": {
|
|
97
|
+
"White": 35827394
|
|
98
|
+
}
|
|
99
|
+
},
|
|
100
|
+
"observation_datetime": {
|
|
101
|
+
"source_table": "Demographics.csv",
|
|
102
|
+
"source_field": "date_of_birth"
|
|
103
|
+
},
|
|
104
|
+
"observation_source_concept_id": {
|
|
105
|
+
"source_table": "Demographics.csv",
|
|
106
|
+
"source_field": "ethnicity",
|
|
107
|
+
"term_mapping": {
|
|
108
|
+
"White": 35827394
|
|
109
|
+
}
|
|
110
|
+
},
|
|
111
|
+
"observation_source_value": {
|
|
112
|
+
"source_table": "Demographics.csv",
|
|
113
|
+
"source_field": "ethnicity"
|
|
114
|
+
},
|
|
115
|
+
"person_id": {
|
|
116
|
+
"source_table": "Demographics.csv",
|
|
117
|
+
"source_field": "PersonID"
|
|
118
|
+
}
|
|
119
|
+
},
|
|
120
|
+
"observation_4":{
|
|
121
|
+
"observation_concept_id": {
|
|
122
|
+
"source_table": "Demographics.csv",
|
|
123
|
+
"source_field": "ethnicity",
|
|
124
|
+
"term_mapping": {
|
|
125
|
+
"Black": 35825567
|
|
126
|
+
}
|
|
127
|
+
},
|
|
128
|
+
"observation_datetime": {
|
|
129
|
+
"source_table": "Demographics.csv",
|
|
130
|
+
"source_field": "date_of_birth"
|
|
131
|
+
},
|
|
132
|
+
"observation_source_concept_id": {
|
|
133
|
+
"source_table": "Demographics.csv",
|
|
134
|
+
"source_field": "ethnicity",
|
|
135
|
+
"term_mapping": {
|
|
136
|
+
"Black": 35825567
|
|
137
|
+
}
|
|
138
|
+
},
|
|
139
|
+
"observation_source_value": {
|
|
140
|
+
"source_table": "Demographics.csv",
|
|
141
|
+
"source_field": "ethnicity"
|
|
142
|
+
},
|
|
143
|
+
"person_id": {
|
|
144
|
+
"source_table": "Demographics.csv",
|
|
145
|
+
"source_field": "PersonID"
|
|
146
|
+
}
|
|
147
|
+
},
|
|
148
|
+
"observation_5":{
|
|
149
|
+
"observation_concept_id": {
|
|
150
|
+
"source_table": "Demographics.csv",
|
|
151
|
+
"source_field": "ethnicity",
|
|
152
|
+
"term_mapping": {
|
|
153
|
+
"White and Asian": 35827395
|
|
154
|
+
}
|
|
155
|
+
},
|
|
156
|
+
"observation_datetime": {
|
|
157
|
+
"source_table": "Demographics.csv",
|
|
158
|
+
"source_field": "date_of_birth"
|
|
159
|
+
},
|
|
160
|
+
"observation_source_concept_id": {
|
|
161
|
+
"source_table": "Demographics.csv",
|
|
162
|
+
"source_field": "ethnicity",
|
|
163
|
+
"term_mapping": {
|
|
164
|
+
"White and Asian": 35827395
|
|
165
|
+
}
|
|
166
|
+
},
|
|
167
|
+
"observation_source_value": {
|
|
168
|
+
"source_table": "Demographics.csv",
|
|
169
|
+
"source_field": "ethnicity"
|
|
170
|
+
},
|
|
171
|
+
"person_id": {
|
|
172
|
+
"source_table": "Demographics.csv",
|
|
173
|
+
"source_field": "PersonID"
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
},
|
|
177
|
+
"condition_occurrence": {
|
|
178
|
+
"condition_occurrence_0":{
|
|
179
|
+
"condition_concept_id": {
|
|
180
|
+
"source_table": "Symptoms.csv",
|
|
181
|
+
"source_field": "symptom1",
|
|
182
|
+
"term_mapping": {
|
|
183
|
+
"Y": 254761
|
|
184
|
+
}
|
|
185
|
+
},
|
|
186
|
+
"condition_end_datetime": {
|
|
187
|
+
"source_table": "Symptoms.csv",
|
|
188
|
+
"source_field": "visit_date"
|
|
189
|
+
},
|
|
190
|
+
"condition_source_concept_id": {
|
|
191
|
+
"source_table": "Symptoms.csv",
|
|
192
|
+
"source_field": "symptom1",
|
|
193
|
+
"term_mapping": {
|
|
194
|
+
"Y": 254761
|
|
195
|
+
}
|
|
196
|
+
},
|
|
197
|
+
"condition_source_value": {
|
|
198
|
+
"source_table": "Symptoms.csv",
|
|
199
|
+
"source_field": "symptom1"
|
|
200
|
+
},
|
|
201
|
+
"condition_start_datetime": {
|
|
202
|
+
"source_table": "Symptoms.csv",
|
|
203
|
+
"source_field": "visit_date"
|
|
204
|
+
},
|
|
205
|
+
"person_id": {
|
|
206
|
+
"source_table": "Symptoms.csv",
|
|
207
|
+
"source_field": "PersonID"
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
},
|
|
211
|
+
"person": {
|
|
212
|
+
"female":{
|
|
213
|
+
"birth_datetime": {
|
|
214
|
+
"source_table": "Demographics.csv",
|
|
215
|
+
"source_field": "date_of_birth"
|
|
216
|
+
},
|
|
217
|
+
"gender_concept_id": {
|
|
218
|
+
"source_table": "Demographics.csv",
|
|
219
|
+
"source_field": "sex",
|
|
220
|
+
"term_mapping": {
|
|
221
|
+
"F": 8532
|
|
222
|
+
}
|
|
223
|
+
},
|
|
224
|
+
"gender_source_concept_id": {
|
|
225
|
+
"source_table": "Demographics.csv",
|
|
226
|
+
"source_field": "sex",
|
|
227
|
+
"term_mapping": {
|
|
228
|
+
"F": 8532
|
|
229
|
+
}
|
|
230
|
+
},
|
|
231
|
+
"gender_source_value": {
|
|
232
|
+
"source_table": "Demographics.csv",
|
|
233
|
+
"source_field": "sex"
|
|
234
|
+
},
|
|
235
|
+
"person_id": {
|
|
236
|
+
"source_table": "Demographics.csv",
|
|
237
|
+
"source_field": "PersonID"
|
|
238
|
+
}
|
|
239
|
+
},
|
|
240
|
+
"male":{
|
|
241
|
+
"birth_datetime": {
|
|
242
|
+
"source_table": "Demographics.csv",
|
|
243
|
+
"source_field": "date_of_birth"
|
|
244
|
+
},
|
|
245
|
+
"gender_concept_id": {
|
|
246
|
+
"source_table": "Demographics.csv",
|
|
247
|
+
"source_field": "sex",
|
|
248
|
+
"term_mapping": {
|
|
249
|
+
"M": 8507
|
|
250
|
+
}
|
|
251
|
+
},
|
|
252
|
+
"gender_source_concept_id": {
|
|
253
|
+
"source_table": "Demographics.csv",
|
|
254
|
+
"source_field": "sex",
|
|
255
|
+
"term_mapping": {
|
|
256
|
+
"M": 8507
|
|
257
|
+
}
|
|
258
|
+
},
|
|
259
|
+
"gender_source_value": {
|
|
260
|
+
"source_table": "Demographics.csv",
|
|
261
|
+
"source_field": "sex"
|
|
262
|
+
},
|
|
263
|
+
"person_id": {
|
|
264
|
+
"source_table": "Demographics.csv",
|
|
265
|
+
"source_field": "PersonID"
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
},
|
|
269
|
+
"measurement": {
|
|
270
|
+
"covid_antibody":{
|
|
271
|
+
"value_as_number": {
|
|
272
|
+
"source_table": "covid19_antibody.csv",
|
|
273
|
+
"source_field": "IgG"
|
|
274
|
+
},
|
|
275
|
+
"measurement_source_value": {
|
|
276
|
+
"source_table": "covid19_antibody.csv",
|
|
277
|
+
"source_field": "IgG"
|
|
278
|
+
},
|
|
279
|
+
"measurement_concept_id": {
|
|
280
|
+
"source_table": "covid19_antibody.csv",
|
|
281
|
+
"source_field": "IgG",
|
|
282
|
+
"term_mapping": 37398191
|
|
283
|
+
},
|
|
284
|
+
"measurement_source_concept_id": {
|
|
285
|
+
"source_table": "covid19_antibody.csv",
|
|
286
|
+
"source_field": "IgG",
|
|
287
|
+
"term_mapping": 37398191
|
|
288
|
+
},
|
|
289
|
+
"measurement_datetime": {
|
|
290
|
+
"source_table": "covid19_antibody.csv",
|
|
291
|
+
"source_field": "date"
|
|
292
|
+
},
|
|
293
|
+
"person_id": {
|
|
294
|
+
"source_table": "covid19_antibody.csv",
|
|
295
|
+
"source_field": "PersonID"
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import click
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def PathArgs():
    """Build the click parameter type used for CLI arguments that are files.

    Returns a fresh instance of a ``click.ParamType`` subclass whose
    ``convert`` turns the raw command line value into a ``pathlib.Path``.
    """

    class PathArgs(click.ParamType):
        name = "pathlib.Path"

        def convert(self, value, param, ctx):
            """Return ``Path(value)``; hand any failure to ``self.fail``."""
            try:
                converted = Path(value)
            except Exception as err:
                self.fail(f"Invalid path: {value} ({err})", param, ctx)
            else:
                return converted

    param_type = PathArgs()
    return param_type


# use this: from here on the module-level name ``PathArgs`` is the ready-made
# instance (the factory function above is replaced by its own result)
PathArgs = PathArgs()
|
|
@@ -1,15 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
1
3
|
import os
|
|
2
4
|
import sys
|
|
3
5
|
import json
|
|
6
|
+
import importlib.resources as resources
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
4
12
|
|
|
5
13
|
# Function inherited from the "old" CaRROT-CDM (modified to exit on error)
|
|
6
|
-
|
|
7
|
-
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_json(f_in: Path):
    """Load and return the JSON document stored in the file ``f_in``.

    ``f_in`` is coerced with ``Path``, so a ``pathlib.Path``, a plain string
    or any other path-like value is accepted.

    If the file cannot be opened, or its content cannot be parsed as JSON,
    the error is logged together with its traceback and the process is
    terminated through ``sys.exit`` with a non-zero status.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with Path(f_in).open() as json_file:
            data = json.load(json_file)
    except Exception:
        logger.exception("{0} not found. Or cannot parse as json".format(f_in))
        # Exit status 1: a bare sys.exit() would report success (status 0).
        sys.exit(1)

    return data
|
|
15
24
|
|
|
25
|
+
|
|
26
|
+
def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
    """Resolve special path syntaxes in command line arguments.

    An argument whose text starts with ``@carrot`` is rewritten to a path
    inside the installed ``carrottransform`` package directory. ``None``
    entries and every other argument are returned unchanged, in the same
    order as they were given.

    Only the leading ``@carrot`` is removed; a later occurrence of that text
    inside the path is kept as part of the file name.
    """
    prefix = '@carrot'
    package_path: Optional[Path] = None  # looked up lazily, at most once
    resolved: List[Optional[Path]] = []

    for arg in args:
        text = None if arg is None else str(arg)
        if text is None or not text.startswith(prefix):
            resolved.append(arg)
            continue

        if package_path is None:
            try:
                # No ``with`` here: pathlib.Path objects stopped being
                # context managers in Python 3.13.
                package_path = Path(
                    resources.files('carrottransform').joinpath('__init__.py')
                ).parent
            except Exception:
                # Fallback for development environment
                import carrottransform
                package_path = Path(carrottransform.__file__).resolve().parent

        # Drop the prefix itself, then any separator that followed it, so the
        # remainder is always joined as a relative path.
        relative = text[len(prefix):].lstrip('/' + os.sep)
        resolved.append(package_path / Path(relative))

    return resolved
|
|
@@ -3,13 +3,17 @@ import json
|
|
|
3
3
|
import carrottransform.tools as tools
|
|
4
4
|
from .omopcdm import OmopCDM
|
|
5
5
|
|
|
6
|
+
import logging
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
6
9
|
class MappingRules:
|
|
7
10
|
"""
|
|
8
11
|
self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
|
|
9
12
|
as a file-specific dictionary allowing rules to be "looked-up" depending on data content
|
|
10
13
|
"""
|
|
11
14
|
|
|
12
|
-
def __init__(self, rulesfilepath, omopcdm):
|
|
15
|
+
def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
|
|
16
|
+
## just loads the json directly
|
|
13
17
|
self.rules_data = tools.load_json(rulesfilepath)
|
|
14
18
|
self.omopcdm = omopcdm
|
|
15
19
|
|
|
@@ -34,12 +38,7 @@ class MappingRules:
|
|
|
34
38
|
return self.dataset_name
|
|
35
39
|
|
|
36
40
|
def get_all_outfile_names(self):
    """Return the output file names: the keys of the rules' "cdm" section, in stored order."""
    cdm_rules = self.rules_data["cdm"]
    return [*cdm_rules]
|
|
43
42
|
|
|
44
43
|
def get_all_infile_names(self):
|
|
45
44
|
file_list = []
|
|
@@ -84,11 +83,11 @@ class MappingRules:
|
|
|
84
83
|
outfile = keydata[-1]
|
|
85
84
|
for outfield_elem in outfield_data:
|
|
86
85
|
for infield, outfield_list in outfield_elem.items():
|
|
87
|
-
|
|
86
|
+
logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
|
|
88
87
|
for outfield in outfield_list:
|
|
89
|
-
if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
|
|
88
|
+
if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
|
|
90
89
|
datetime_source = infield
|
|
91
|
-
if outfield == self.omopcdm.get_omop_person_id_field(outfile):
|
|
90
|
+
if outfield.split('~')[0] == self.omopcdm.get_omop_person_id_field(outfile):
|
|
92
91
|
person_id_source = infield
|
|
93
92
|
|
|
94
93
|
return datetime_source, person_id_source
|
|
@@ -101,6 +100,7 @@ class MappingRules:
|
|
|
101
100
|
person_id_source = None
|
|
102
101
|
if tgtfilename in self.rules_data["cdm"]:
|
|
103
102
|
source_rules_data = self.rules_data["cdm"][tgtfilename]
|
|
103
|
+
## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unnecessary looping
|
|
104
104
|
for rule_name, rule_fields in source_rules_data.items():
|
|
105
105
|
if "birth_datetime" in rule_fields:
|
|
106
106
|
birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
|
|
@@ -113,6 +113,7 @@ class MappingRules:
|
|
|
113
113
|
"""
|
|
114
114
|
Parse rules to produce a map of source to target data for a given input file
|
|
115
115
|
"""
|
|
116
|
+
## creates a dict of dicts that has input files as keys, and infile~field~data~target as keys for the underlying keys, which contain a list of dicts of lists
|
|
116
117
|
if infilename in self.outfile_names and infilename in self.parsed_rules:
|
|
117
118
|
return self.outfile_names[infilename], self.parsed_rules[infilename]
|
|
118
119
|
outfilenames = []
|
|
@@ -141,6 +142,7 @@ class MappingRules:
|
|
|
141
142
|
plain_key = ""
|
|
142
143
|
term_value_key = ""
|
|
143
144
|
|
|
145
|
+
## iterate through the rules, looking for rules that apply to the input file.
|
|
144
146
|
for outfield, source_info in rules.items():
|
|
145
147
|
if source_info["source_field"] not in data:
|
|
146
148
|
data[source_info["source_field"]] = []
|
|
@@ -148,6 +150,7 @@ class MappingRules:
|
|
|
148
150
|
if "term_mapping" in source_info:
|
|
149
151
|
if type(source_info["term_mapping"]) is dict:
|
|
150
152
|
for inputvalue, term in source_info["term_mapping"].items():
|
|
153
|
+
## add a key/add to the list of data in the dict for the given input file
|
|
151
154
|
term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
|
|
152
155
|
data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"][str(inputvalue)]))
|
|
153
156
|
else:
|