carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

@@ -0,0 +1,300 @@
1
+ {
2
+ "metadata": {
3
+ "date_created": "2021-06-14T15:27:37.123947",
4
+ "dataset": "Test"
5
+ },
6
+ "cdm": {
7
+ "observation": {
8
+ "observation_0": {
9
+ "observation_concept_id": {
10
+ "source_table": "Demographics.csv",
11
+ "source_field": "ethnicity",
12
+ "term_mapping": {
13
+ "Asian": 35825508
14
+ }
15
+ },
16
+ "observation_datetime": {
17
+ "source_table": "Demographics.csv",
18
+ "source_field": "date_of_birth"
19
+ },
20
+ "observation_source_concept_id": {
21
+ "source_table": "Demographics.csv",
22
+ "source_field": "ethnicity",
23
+ "term_mapping": {
24
+ "Asian": 35825508
25
+ }
26
+ },
27
+ "observation_source_value": {
28
+ "source_table": "Demographics.csv",
29
+ "source_field": "ethnicity"
30
+ },
31
+ "person_id": {
32
+ "source_table": "Demographics.csv",
33
+ "source_field": "PersonID"
34
+ }
35
+ },
36
+ "observation_1":{
37
+ "observation_concept_id": {
38
+ "source_table": "Demographics.csv",
39
+ "source_field": "ethnicity",
40
+ "term_mapping": {
41
+ "Bangladeshi": 35825531
42
+ }
43
+ },
44
+ "observation_datetime": {
45
+ "source_table": "Demographics.csv",
46
+ "source_field": "date_of_birth"
47
+ },
48
+ "observation_source_concept_id": {
49
+ "source_table": "Demographics.csv",
50
+ "source_field": "ethnicity",
51
+ "term_mapping": {
52
+ "Bangladeshi": 35825531
53
+ }
54
+ },
55
+ "observation_source_value": {
56
+ "source_table": "Demographics.csv",
57
+ "source_field": "ethnicity"
58
+ },
59
+ "person_id": {
60
+ "source_table": "Demographics.csv",
61
+ "source_field": "PersonID"
62
+ }
63
+ },
64
+ "observation_2":{
65
+ "observation_concept_id": {
66
+ "source_table": "Demographics.csv",
67
+ "source_field": "ethnicity",
68
+ "term_mapping": {
69
+ "Indian": 35826241
70
+ }
71
+ },
72
+ "observation_datetime": {
73
+ "source_table": "Demographics.csv",
74
+ "source_field": "date_of_birth"
75
+ },
76
+ "observation_source_concept_id": {
77
+ "source_table": "Demographics.csv",
78
+ "source_field": "ethnicity",
79
+ "term_mapping": {
80
+ "Indian": 35826241
81
+ }
82
+ },
83
+ "observation_source_value": {
84
+ "source_table": "Demographics.csv",
85
+ "source_field": "ethnicity"
86
+ },
87
+ "person_id": {
88
+ "source_table": "Demographics.csv",
89
+ "source_field": "PersonID"
90
+ }
91
+ },
92
+ "observation_3":{
93
+ "observation_concept_id": {
94
+ "source_table": "Demographics.csv",
95
+ "source_field": "ethnicity",
96
+ "term_mapping": {
97
+ "White": 35827394
98
+ }
99
+ },
100
+ "observation_datetime": {
101
+ "source_table": "Demographics.csv",
102
+ "source_field": "date_of_birth"
103
+ },
104
+ "observation_source_concept_id": {
105
+ "source_table": "Demographics.csv",
106
+ "source_field": "ethnicity",
107
+ "term_mapping": {
108
+ "White": 35827394
109
+ }
110
+ },
111
+ "observation_source_value": {
112
+ "source_table": "Demographics.csv",
113
+ "source_field": "ethnicity"
114
+ },
115
+ "person_id": {
116
+ "source_table": "Demographics.csv",
117
+ "source_field": "PersonID"
118
+ }
119
+ },
120
+ "observation_4":{
121
+ "observation_concept_id": {
122
+ "source_table": "Demographics.csv",
123
+ "source_field": "ethnicity",
124
+ "term_mapping": {
125
+ "Black": 35825567
126
+ }
127
+ },
128
+ "observation_datetime": {
129
+ "source_table": "Demographics.csv",
130
+ "source_field": "date_of_birth"
131
+ },
132
+ "observation_source_concept_id": {
133
+ "source_table": "Demographics.csv",
134
+ "source_field": "ethnicity",
135
+ "term_mapping": {
136
+ "Black": 35825567
137
+ }
138
+ },
139
+ "observation_source_value": {
140
+ "source_table": "Demographics.csv",
141
+ "source_field": "ethnicity"
142
+ },
143
+ "person_id": {
144
+ "source_table": "Demographics.csv",
145
+ "source_field": "PersonID"
146
+ }
147
+ },
148
+ "observation_5":{
149
+ "observation_concept_id": {
150
+ "source_table": "Demographics.csv",
151
+ "source_field": "ethnicity",
152
+ "term_mapping": {
153
+ "White and Asian": 35827395
154
+ }
155
+ },
156
+ "observation_datetime": {
157
+ "source_table": "Demographics.csv",
158
+ "source_field": "date_of_birth"
159
+ },
160
+ "observation_source_concept_id": {
161
+ "source_table": "Demographics.csv",
162
+ "source_field": "ethnicity",
163
+ "term_mapping": {
164
+ "White and Asian": 35827395
165
+ }
166
+ },
167
+ "observation_source_value": {
168
+ "source_table": "Demographics.csv",
169
+ "source_field": "ethnicity"
170
+ },
171
+ "person_id": {
172
+ "source_table": "Demographics.csv",
173
+ "source_field": "PersonID"
174
+ }
175
+ }
176
+ },
177
+ "condition_occurrence": {
178
+ "condition_occurrence_0":{
179
+ "condition_concept_id": {
180
+ "source_table": "Symptoms.csv",
181
+ "source_field": "symptom1",
182
+ "term_mapping": {
183
+ "Y": 254761
184
+ }
185
+ },
186
+ "condition_end_datetime": {
187
+ "source_table": "Symptoms.csv",
188
+ "source_field": "visit_date"
189
+ },
190
+ "condition_source_concept_id": {
191
+ "source_table": "Symptoms.csv",
192
+ "source_field": "symptom1",
193
+ "term_mapping": {
194
+ "Y": 254761
195
+ }
196
+ },
197
+ "condition_source_value": {
198
+ "source_table": "Symptoms.csv",
199
+ "source_field": "symptom1"
200
+ },
201
+ "condition_start_datetime": {
202
+ "source_table": "Symptoms.csv",
203
+ "source_field": "visit_date"
204
+ },
205
+ "person_id": {
206
+ "source_table": "Symptoms.csv",
207
+ "source_field": "PersonID"
208
+ }
209
+ }
210
+ },
211
+ "person": {
212
+ "female":{
213
+ "birth_datetime": {
214
+ "source_table": "Demographics.csv",
215
+ "source_field": "date_of_birth"
216
+ },
217
+ "gender_concept_id": {
218
+ "source_table": "Demographics.csv",
219
+ "source_field": "sex",
220
+ "term_mapping": {
221
+ "F": 8532
222
+ }
223
+ },
224
+ "gender_source_concept_id": {
225
+ "source_table": "Demographics.csv",
226
+ "source_field": "sex",
227
+ "term_mapping": {
228
+ "F": 8532
229
+ }
230
+ },
231
+ "gender_source_value": {
232
+ "source_table": "Demographics.csv",
233
+ "source_field": "sex"
234
+ },
235
+ "person_id": {
236
+ "source_table": "Demographics.csv",
237
+ "source_field": "PersonID"
238
+ }
239
+ },
240
+ "male":{
241
+ "birth_datetime": {
242
+ "source_table": "Demographics.csv",
243
+ "source_field": "date_of_birth"
244
+ },
245
+ "gender_concept_id": {
246
+ "source_table": "Demographics.csv",
247
+ "source_field": "sex",
248
+ "term_mapping": {
249
+ "M": 8507
250
+ }
251
+ },
252
+ "gender_source_concept_id": {
253
+ "source_table": "Demographics.csv",
254
+ "source_field": "sex",
255
+ "term_mapping": {
256
+ "M": 8507
257
+ }
258
+ },
259
+ "gender_source_value": {
260
+ "source_table": "Demographics.csv",
261
+ "source_field": "sex"
262
+ },
263
+ "person_id": {
264
+ "source_table": "Demographics.csv",
265
+ "source_field": "PersonID"
266
+ }
267
+ }
268
+ },
269
+ "measurement": {
270
+ "covid_antibody":{
271
+ "value_as_number": {
272
+ "source_table": "covid19_antibody.csv",
273
+ "source_field": "IgG"
274
+ },
275
+ "measurement_source_value": {
276
+ "source_table": "covid19_antibody.csv",
277
+ "source_field": "IgG"
278
+ },
279
+ "measurement_concept_id": {
280
+ "source_table": "covid19_antibody.csv",
281
+ "source_field": "IgG",
282
+ "term_mapping": 37398191
283
+ },
284
+ "measurement_source_concept_id": {
285
+ "source_table": "covid19_antibody.csv",
286
+ "source_field": "IgG",
287
+ "term_mapping": 37398191
288
+ },
289
+ "measurement_datetime": {
290
+ "source_table": "covid19_antibody.csv",
291
+ "source_field": "date"
292
+ },
293
+ "person_id": {
294
+ "source_table": "covid19_antibody.csv",
295
+ "source_field": "PersonID"
296
+ }
297
+ }
298
+ }
299
+ }
300
+ }
@@ -0,0 +1,21 @@
1
+ import click
2
+ from pathlib import Path
3
+
4
+
5
def PathArgs():
    """Build the click parameter type used for CLI arguments that are files.

    Returns:
        An instance of a ``click.ParamType`` subclass whose ``convert``
        turns the raw command-line value into a ``pathlib.Path``.
    """

    class PathArgs(click.ParamType):
        # Descriptive name of this parameter type.
        name = "pathlib.Path"

        def convert(self, value, param, ctx):
            """Return ``value`` as a ``Path``, or report a click failure."""
            try:
                converted = Path(value)
            except Exception as e:
                # self.fail() reports the problem through click's own
                # error handling instead of letting the exception escape.
                self.fail(f"Invalid path: {value} ({e})", param, ctx)
            else:
                return converted

    param_type = PathArgs()
    return param_type
18
+
19
+
20
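+ # Use this: the name PathArgs is rebound below from the factory function to the click parameter type instance it returns.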
21
+ PathArgs = PathArgs()
@@ -1,15 +1,41 @@
1
+ import json
2
+ import logging
1
3
  import os
2
4
  import sys
3
5
  import json
6
+ import importlib.resources as resources
7
+ from typing import List, Optional
8
+ from pathlib import Path
9
+
10
+ logger = logging.getLogger(__name__)
11
+
4
12
 
5
13
  # Function inherited from the "old" CaRROT-CDM (modified to exit on error)
6
-
7
- def load_json(f_in):
14
+
15
+
16
def load_json(f_in: Path):
    """Load and parse the JSON file at ``f_in``.

    Args:
        f_in: Location of the JSON file. A ``pathlib.Path`` is expected, but
            anything ``Path()`` accepts (a plain string or another
            ``os.PathLike``) works as well.

    Returns:
        The parsed JSON content.

    Raises:
        SystemExit: With a non-zero status when the file cannot be opened or
            parsed. The failure is logged with its traceback first.
    """
    try:
        # Path(...) keeps string / os.PathLike callers working, and the
        # ``with`` block closes the handle even when parsing fails.
        with Path(f_in).open() as json_file:
            data = json.load(json_file)
    except Exception:
        logger.exception("{0} not found. Or cannot parse as json".format(f_in))
        # A bare sys.exit() would report success (status 0) to the shell.
        sys.exit(1)

    return data
15
24
 
25
+
26
def _carrot_package_path() -> Path:
    """Return the directory that holds the carrottransform package."""
    try:
        package_dir = resources.files('carrottransform')
    except Exception:
        package_dir = None
    # Path objects are no longer context managers on Python 3.13+, so the
    # location is used directly instead of going through a ``with`` block.
    if isinstance(package_dir, Path):
        return package_dir

    # Fallback for development environment
    import carrottransform
    return Path(carrottransform.__file__).resolve().parent


def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
    """Resolve special path syntaxes in command line arguments.

    An argument whose string form starts with ``@carrot`` is rewritten to
    point inside the carrottransform package directory. ``None`` and every
    other argument are returned unchanged.

    Args:
        args: Paths (or ``None`` placeholders) taken from the command line.

    Returns:
        A new list of the same length and order as ``args``.
    """
    prefix = '@carrot'
    package_path: Optional[Path] = None
    resolved: List[Optional[Path]] = []

    for arg in args:
        # Handle None values and anything that does not use the prefix
        if arg is None or not str(arg).startswith(prefix):
            resolved.append(arg)
            continue

        # Look the package up only when an argument actually needs it.
        if package_path is None:
            package_path = _carrot_package_path()

        # Drop only the leading prefix; a later "@carrot" inside the path is
        # part of a real file or directory name. Both separators are
        # stripped so the remainder stays relative on Windows too.
        remainder = str(arg)[len(prefix):].lstrip('/\\')
        resolved.append(package_path / Path(remainder))

    return resolved
@@ -3,13 +3,17 @@ import json
3
3
  import carrottransform.tools as tools
4
4
  from .omopcdm import OmopCDM
5
5
 
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+
6
9
  class MappingRules:
7
10
  """
8
11
  self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
9
12
  as a file-specific dictionary allowing rules to be "looked-up" depending on data content
10
13
  """
11
14
 
12
- def __init__(self, rulesfilepath, omopcdm):
15
+ def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
16
+ ## just loads the json directly
13
17
  self.rules_data = tools.load_json(rulesfilepath)
14
18
  self.omopcdm = omopcdm
15
19
 
@@ -34,12 +38,7 @@ class MappingRules:
34
38
  return self.dataset_name
35
39
 
36
40
  def get_all_outfile_names(self):
37
- file_list = []
38
-
39
- for outfilename in self.rules_data["cdm"]:
40
- file_list.append(outfilename)
41
-
42
- return file_list
41
+ return list(self.rules_data["cdm"])
43
42
 
44
43
  def get_all_infile_names(self):
45
44
  file_list = []
@@ -84,11 +83,11 @@ class MappingRules:
84
83
  outfile = keydata[-1]
85
84
  for outfield_elem in outfield_data:
86
85
  for infield, outfield_list in outfield_elem.items():
87
- #print("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
86
+ logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
88
87
  for outfield in outfield_list:
89
- if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
88
+ if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
90
89
  datetime_source = infield
91
- if outfield == self.omopcdm.get_omop_person_id_field(outfile):
90
+ if outfield.split('~')[0] == self.omopcdm.get_omop_person_id_field(outfile):
92
91
  person_id_source = infield
93
92
 
94
93
  return datetime_source, person_id_source
@@ -101,6 +100,7 @@ class MappingRules:
101
100
  person_id_source = None
102
101
  if tgtfilename in self.rules_data["cdm"]:
103
102
  source_rules_data = self.rules_data["cdm"][tgtfilename]
103
+ ## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unnecessary looping
104
104
  for rule_name, rule_fields in source_rules_data.items():
105
105
  if "birth_datetime" in rule_fields:
106
106
  birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
@@ -113,6 +113,7 @@ class MappingRules:
113
113
  """
114
114
  Parse rules to produce a map of source to target data for a given input file
115
115
  """
116
+ ## creates a dict of dicts keyed by input file; each inner dict is keyed by infile~field~data~target, and each of its values is a list of dicts of lists
116
117
  if infilename in self.outfile_names and infilename in self.parsed_rules:
117
118
  return self.outfile_names[infilename], self.parsed_rules[infilename]
118
119
  outfilenames = []
@@ -141,6 +142,7 @@ class MappingRules:
141
142
  plain_key = ""
142
143
  term_value_key = ""
143
144
 
145
+ ## iterate through the rules, looking for rules that apply to the input file.
144
146
  for outfield, source_info in rules.items():
145
147
  if source_info["source_field"] not in data:
146
148
  data[source_info["source_field"]] = []
@@ -148,6 +150,7 @@ class MappingRules:
148
150
  if "term_mapping" in source_info:
149
151
  if type(source_info["term_mapping"]) is dict:
150
152
  for inputvalue, term in source_info["term_mapping"].items():
153
+ ## add a key/add to the list of data in the dict for the given input file
151
154
  term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
152
155
  data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"][str(inputvalue)]))
153
156
  else: