carrot-transform 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
1
+ {
2
+ "metadata": {
3
+ "date_created": "2021-06-14T15:27:37.123947",
4
+ "dataset": "Test"
5
+ },
6
+ "cdm": {
7
+ "observation": {
8
+ "observation_0": {
9
+ "observation_concept_id": {
10
+ "source_table": "Demographics.csv",
11
+ "source_field": "ethnicity",
12
+ "term_mapping": {
13
+ "Asian": 35825508
14
+ }
15
+ },
16
+ "observation_datetime": {
17
+ "source_table": "Demographics.csv",
18
+ "source_field": "date_of_birth"
19
+ },
20
+ "observation_source_concept_id": {
21
+ "source_table": "Demographics.csv",
22
+ "source_field": "ethnicity",
23
+ "term_mapping": {
24
+ "Asian": 35825508
25
+ }
26
+ },
27
+ "observation_source_value": {
28
+ "source_table": "Demographics.csv",
29
+ "source_field": "ethnicity"
30
+ },
31
+ "person_id": {
32
+ "source_table": "Demographics.csv",
33
+ "source_field": "PersonID"
34
+ }
35
+ },
36
+ "observation_1":{
37
+ "observation_concept_id": {
38
+ "source_table": "Demographics.csv",
39
+ "source_field": "ethnicity",
40
+ "term_mapping": {
41
+ "Bangladeshi": 35825531
42
+ }
43
+ },
44
+ "observation_datetime": {
45
+ "source_table": "Demographics.csv",
46
+ "source_field": "date_of_birth"
47
+ },
48
+ "observation_source_concept_id": {
49
+ "source_table": "Demographics.csv",
50
+ "source_field": "ethnicity",
51
+ "term_mapping": {
52
+ "Bangladeshi": 35825531
53
+ }
54
+ },
55
+ "observation_source_value": {
56
+ "source_table": "Demographics.csv",
57
+ "source_field": "ethnicity"
58
+ },
59
+ "person_id": {
60
+ "source_table": "Demographics.csv",
61
+ "source_field": "PersonID"
62
+ }
63
+ },
64
+ "observation_2":{
65
+ "observation_concept_id": {
66
+ "source_table": "Demographics.csv",
67
+ "source_field": "ethnicity",
68
+ "term_mapping": {
69
+ "Indian": 35826241
70
+ }
71
+ },
72
+ "observation_datetime": {
73
+ "source_table": "Demographics.csv",
74
+ "source_field": "date_of_birth"
75
+ },
76
+ "observation_source_concept_id": {
77
+ "source_table": "Demographics.csv",
78
+ "source_field": "ethnicity",
79
+ "term_mapping": {
80
+ "Indian": 35826241
81
+ }
82
+ },
83
+ "observation_source_value": {
84
+ "source_table": "Demographics.csv",
85
+ "source_field": "ethnicity"
86
+ },
87
+ "person_id": {
88
+ "source_table": "Demographics.csv",
89
+ "source_field": "PersonID"
90
+ }
91
+ },
92
+ "observation_3":{
93
+ "observation_concept_id": {
94
+ "source_table": "Demographics.csv",
95
+ "source_field": "ethnicity",
96
+ "term_mapping": {
97
+ "White": 35827394
98
+ }
99
+ },
100
+ "observation_datetime": {
101
+ "source_table": "Demographics.csv",
102
+ "source_field": "date_of_birth"
103
+ },
104
+ "observation_source_concept_id": {
105
+ "source_table": "Demographics.csv",
106
+ "source_field": "ethnicity",
107
+ "term_mapping": {
108
+ "White": 35827394
109
+ }
110
+ },
111
+ "observation_source_value": {
112
+ "source_table": "Demographics.csv",
113
+ "source_field": "ethnicity"
114
+ },
115
+ "person_id": {
116
+ "source_table": "Demographics.csv",
117
+ "source_field": "PersonID"
118
+ }
119
+ },
120
+ "observation_4":{
121
+ "observation_concept_id": {
122
+ "source_table": "Demographics.csv",
123
+ "source_field": "ethnicity",
124
+ "term_mapping": {
125
+ "Black": 35825567
126
+ }
127
+ },
128
+ "observation_datetime": {
129
+ "source_table": "Demographics.csv",
130
+ "source_field": "date_of_birth"
131
+ },
132
+ "observation_source_concept_id": {
133
+ "source_table": "Demographics.csv",
134
+ "source_field": "ethnicity",
135
+ "term_mapping": {
136
+ "Black": 35825567
137
+ }
138
+ },
139
+ "observation_source_value": {
140
+ "source_table": "Demographics.csv",
141
+ "source_field": "ethnicity"
142
+ },
143
+ "person_id": {
144
+ "source_table": "Demographics.csv",
145
+ "source_field": "PersonID"
146
+ }
147
+ },
148
+ "observation_5":{
149
+ "observation_concept_id": {
150
+ "source_table": "Demographics.csv",
151
+ "source_field": "ethnicity",
152
+ "term_mapping": {
153
+ "White and Asian": 35827395
154
+ }
155
+ },
156
+ "observation_datetime": {
157
+ "source_table": "Demographics.csv",
158
+ "source_field": "date_of_birth"
159
+ },
160
+ "observation_source_concept_id": {
161
+ "source_table": "Demographics.csv",
162
+ "source_field": "ethnicity",
163
+ "term_mapping": {
164
+ "White and Asian": 35827395
165
+ }
166
+ },
167
+ "observation_source_value": {
168
+ "source_table": "Demographics.csv",
169
+ "source_field": "ethnicity"
170
+ },
171
+ "person_id": {
172
+ "source_table": "Demographics.csv",
173
+ "source_field": "PersonID"
174
+ }
175
+ }
176
+ },
177
+ "condition_occurrence": {
178
+ "condition_occurrence_0":{
179
+ "condition_concept_id": {
180
+ "source_table": "Symptoms.csv",
181
+ "source_field": "symptom1",
182
+ "term_mapping": {
183
+ "Y": 254761
184
+ }
185
+ },
186
+ "condition_end_datetime": {
187
+ "source_table": "Symptoms.csv",
188
+ "source_field": "visit_date"
189
+ },
190
+ "condition_source_concept_id": {
191
+ "source_table": "Symptoms.csv",
192
+ "source_field": "symptom1",
193
+ "term_mapping": {
194
+ "Y": 254761
195
+ }
196
+ },
197
+ "condition_source_value": {
198
+ "source_table": "Symptoms.csv",
199
+ "source_field": "symptom1"
200
+ },
201
+ "condition_start_datetime": {
202
+ "source_table": "Symptoms.csv",
203
+ "source_field": "visit_date"
204
+ },
205
+ "person_id": {
206
+ "source_table": "Symptoms.csv",
207
+ "source_field": "PersonID"
208
+ }
209
+ }
210
+ },
211
+ "person": {
212
+ "female":{
213
+ "birth_datetime": {
214
+ "source_table": "Demographics.csv",
215
+ "source_field": "date_of_birth"
216
+ },
217
+ "gender_concept_id": {
218
+ "source_table": "Demographics.csv",
219
+ "source_field": "sex",
220
+ "term_mapping": {
221
+ "F": 8532
222
+ }
223
+ },
224
+ "gender_source_concept_id": {
225
+ "source_table": "Demographics.csv",
226
+ "source_field": "sex",
227
+ "term_mapping": {
228
+ "F": 8532
229
+ }
230
+ },
231
+ "gender_source_value": {
232
+ "source_table": "Demographics.csv",
233
+ "source_field": "sex"
234
+ },
235
+ "person_id": {
236
+ "source_table": "Demographics.csv",
237
+ "source_field": "PersonID"
238
+ }
239
+ },
240
+ "male":{
241
+ "birth_datetime": {
242
+ "source_table": "Demographics.csv",
243
+ "source_field": "date_of_birth"
244
+ },
245
+ "gender_concept_id": {
246
+ "source_table": "Demographics.csv",
247
+ "source_field": "sex",
248
+ "term_mapping": {
249
+ "M": 8507
250
+ }
251
+ },
252
+ "gender_source_concept_id": {
253
+ "source_table": "Demographics.csv",
254
+ "source_field": "sex",
255
+ "term_mapping": {
256
+ "M": 8507
257
+ }
258
+ },
259
+ "gender_source_value": {
260
+ "source_table": "Demographics.csv",
261
+ "source_field": "sex"
262
+ },
263
+ "person_id": {
264
+ "source_table": "Demographics.csv",
265
+ "source_field": "PersonID"
266
+ }
267
+ }
268
+ },
269
+ "measurement": {
270
+ "covid_antibody":{
271
+ "value_as_number": {
272
+ "source_table": "covid19_antibody.csv",
273
+ "source_field": "IgG"
274
+ },
275
+ "measurement_source_value": {
276
+ "source_table": "covid19_antibody.csv",
277
+ "source_field": "IgG"
278
+ },
279
+ "measurement_concept_id": {
280
+ "source_table": "covid19_antibody.csv",
281
+ "source_field": "IgG",
282
+ "term_mapping": 37398191
283
+ },
284
+ "measurement_source_concept_id": {
285
+ "source_table": "covid19_antibody.csv",
286
+ "source_field": "IgG",
287
+ "term_mapping": 37398191
288
+ },
289
+ "measurement_datetime": {
290
+ "source_table": "covid19_antibody.csv",
291
+ "source_field": "date"
292
+ },
293
+ "person_id": {
294
+ "source_table": "covid19_antibody.csv",
295
+ "source_field": "PersonID"
296
+ }
297
+ }
298
+ }
299
+ }
300
+ }
@@ -10,6 +10,7 @@ class MappingRules:
10
10
  """
11
11
 
12
12
  def __init__(self, rulesfilepath, omopcdm):
13
+ ## just loads the json directly
13
14
  self.rules_data = tools.load_json(rulesfilepath)
14
15
  self.omopcdm = omopcdm
15
16
 
@@ -34,12 +35,7 @@ class MappingRules:
34
35
  return self.dataset_name
35
36
 
36
37
  def get_all_outfile_names(self):
37
- file_list = []
38
-
39
- for outfilename in self.rules_data["cdm"]:
40
- file_list.append(outfilename)
41
-
42
- return file_list
38
+ return list(self.rules_data["cdm"])
43
39
 
44
40
  def get_all_infile_names(self):
45
41
  file_list = []
@@ -86,9 +82,9 @@ class MappingRules:
86
82
  for infield, outfield_list in outfield_elem.items():
87
83
  #print("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
88
84
  for outfield in outfield_list:
89
- if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
85
+ if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
90
86
  datetime_source = infield
91
- if outfield == self.omopcdm.get_omop_person_id_field(outfile):
87
+ if outfield.split('~')[0] == self.omopcdm.get_omop_person_id_field(outfile):
92
88
  person_id_source = infield
93
89
 
94
90
  return datetime_source, person_id_source
@@ -101,6 +97,7 @@ class MappingRules:
101
97
  person_id_source = None
102
98
  if tgtfilename in self.rules_data["cdm"]:
103
99
  source_rules_data = self.rules_data["cdm"][tgtfilename]
100
+ ## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unnecessary looping
104
101
  for rule_name, rule_fields in source_rules_data.items():
105
102
  if "birth_datetime" in rule_fields:
106
103
  birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
@@ -113,6 +110,7 @@ class MappingRules:
113
110
  """
114
111
  Parse rules to produce a map of source to target data for a given input file
115
112
  """
113
+ ## creates a dict of dicts that has input files as keys, and infile~field~data~target as keys of the inner dicts, which contain a list of dicts of lists
116
114
  if infilename in self.outfile_names and infilename in self.parsed_rules:
117
115
  return self.outfile_names[infilename], self.parsed_rules[infilename]
118
116
  outfilenames = []
@@ -141,6 +139,7 @@ class MappingRules:
141
139
  plain_key = ""
142
140
  term_value_key = ""
143
141
 
142
+ ## iterate through the rules, looking for rules that apply to the input file.
144
143
  for outfield, source_info in rules.items():
145
144
  if source_info["source_field"] not in data:
146
145
  data[source_info["source_field"]] = []
@@ -148,6 +147,7 @@ class MappingRules:
148
147
  if "term_mapping" in source_info:
149
148
  if type(source_info["term_mapping"]) is dict:
150
149
  for inputvalue, term in source_info["term_mapping"].items():
150
+ ## add a key/add to the list of data in the dict for the given input file
151
151
  term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
152
152
  data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"][str(inputvalue)]))
153
153
  else:
@@ -14,7 +14,10 @@ class OmopCDM:
14
14
  self.numeric_types = ["integer", "numeric"]
15
15
  self.datetime_types = ["timestamp"]
16
16
  self.date_types = ["date"]
17
+ ## ddl sets the headers to go in each table, and whether or not to make it null. Also allows for more tables than we will use.
18
+ ## also adds additional useful keys, like 'all_columns' - before merge
17
19
  self.omop_json = self.load_ddl(omopddl)
20
+ ## adds fields as a dict of dicts - is this so they can get picked up by some of these get_columns?
18
21
  self.omop_json = self.merge_json(self.omop_json, omopcfg)
19
22
  self.all_columns = self.get_columns("all_columns")
20
23
  self.numeric_fields = self.get_columns("numeric_fields")
@@ -47,9 +50,13 @@ class OmopCDM:
47
50
  output_dict["datetime_fields"] = {}
48
51
  output_dict["date_fields"] = {}
49
52
 
53
+ ## matching for version number - matches '--postgresql' at the start of the line, any number of chars, and a version number of the form X.Y at the end of the string or line
50
54
  ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
51
- start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
55
+ ## matching for table name - matches 'CREATE TABLE', an optional prefix (optional '@', some letters (upper and lower case), then '.'), and the table name (letters of either case, or underscores)
56
+ start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
57
+ ## matches optional leading whitespace, lower case letters (or underscores), whitespace, then letters (upper/lower case, or underscores)
52
58
  datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
59
+ ## matching for the end of a statement (presumably the end of a CREATE TABLE block) - matches any chars followed by a close bracket and a semicolon at the end of the string or line
53
60
  end_rgx = re.compile(r'.*[)];$')
54
61
  vermatched = False
55
62
  processing_table_data = False
@@ -76,7 +83,7 @@ class OmopCDM:
76
83
  fname = idtmatch.group(1)
77
84
  ftype = idtmatch.group(2)
78
85
 
79
- # Check for dictionary element presence
86
+ # Check for dictionary element presence, and start an empty list if it doesn't already exist
80
87
  if tabname not in output_dict["all_columns"]:
81
88
  output_dict["all_columns"][tabname] = []
82
89
  if tabname not in output_dict["numeric_fields"]:
@@ -1,48 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: carrot_transform
3
- Version: 0.3.3
4
- Summary:
5
- Author: anwarfg
6
- Author-email: 913028+anwarfg@users.noreply.github.com
7
- Requires-Python: >=3.10,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.10
10
- Classifier: Programming Language :: Python :: 3.11
11
- Classifier: Programming Language :: Python :: 3.12
12
- Classifier: Programming Language :: Python :: 3.13
13
- Requires-Dist: click (>=8.1.7,<9.0.0)
14
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
15
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
16
- Description-Content-Type: text/markdown
17
-
18
- <p align="center">
19
- <a href="https://carrot.ac.uk/" target="_blank">
20
- <picture>
21
- <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
22
- <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
23
- </picture>
24
- </a>
25
- </p>
26
- <div align="center">
27
- <strong>
28
- <h2>Streamlined Data Mapping to OMOP</h2>
29
- <a href="https://carrot.ac.uk/">Carrot Transform</a> executes the conversion of the data to the OMOP CDM.<br />
30
- </strong>
31
- </div>
32
-
33
- TODO:
34
-
35
- - Document carrot-transform
36
- - Add more comments in-code
37
- - Handle capture of ddl and json config via the command-line as optional args
38
-
39
- Reduction in complexity over the original CaRROT-CDM version for the Transform part of _ETL_ - In practice _Extract_ is always
40
- performed by Data Partners, _Load_ by database bulk-load software.
41
-
42
- Statistics
43
-
44
- External libraries imported (approximate)
45
-
46
- carrot-cdm 61
47
- carrot-transform 12
48
-
@@ -1,17 +0,0 @@
1
- carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
2
- carrottransform/_version.py,sha256=NfGqG2TgfjxxrlCHaOtwl3BcE0f6UH0VPrQgoDPjV7Y,72
3
- carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
5
- carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- carrottransform/cli/subcommands/run.py,sha256=3z5cRG4ekyPOP5tvjZOyHUxbclKfBr_Z0tQRRoKj73E,20651
7
- carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
8
- carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
9
- carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
10
- carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
11
- carrottransform/tools/mappingrules.py,sha256=bV6tXHBwVeKAUgCwFTZE2-qTcxKtbs3zbJWedBSviVI,6567
12
- carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
13
- carrottransform/tools/omopcdm.py,sha256=ycyPGgUTUwui7MLxH8JXd-MyCRkG0xOfEoDhCXeogmQ,7623
14
- carrot_transform-0.3.3.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
15
- carrot_transform-0.3.3.dist-info/METADATA,sha256=23mVHLHLXOqgXUFLoU7cSaqIr_yzl9mYf_zgZnteeoY,1474
16
- carrot_transform-0.3.3.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
17
- carrot_transform-0.3.3.dist-info/RECORD,,