carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of carrot-transform might be problematic.
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
- carrot_transform-0.4.0.dist-info/RECORD +41 -0
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
- carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
- carrottransform/__init__.py +1 -1
- carrottransform/_version.py +2 -2
- carrottransform/cli/command.py +9 -5
- carrottransform/cli/subcommands/run.py +214 -526
- carrottransform/cli/subcommands/run_v2.py +145 -0
- carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
- carrottransform/examples/test/rules/v1.json +280 -0
- carrottransform/examples/test/rules/v2.json +115 -0
- carrottransform/tools/__init__.py +4 -14
- carrottransform/tools/args.py +128 -0
- carrottransform/tools/concept_helpers.py +61 -0
- carrottransform/tools/core.py +163 -0
- carrottransform/tools/date_helpers.py +79 -0
- carrottransform/tools/file_helpers.py +153 -9
- carrottransform/tools/logger.py +19 -0
- carrottransform/tools/mapping_types.py +32 -0
- carrottransform/tools/mappingrules.py +297 -34
- carrottransform/tools/metrics.py +162 -109
- carrottransform/tools/omopcdm.py +37 -32
- carrottransform/tools/orchestrator.py +381 -0
- carrottransform/tools/person_helpers.py +126 -0
- carrottransform/tools/record_builder.py +413 -0
- carrottransform/tools/stream_helpers.py +71 -0
- carrottransform/tools/types.py +71 -0
- carrottransform/tools/validation.py +62 -0
- carrot_transform-0.3.5.dist-info/RECORD +0 -25
- carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/metrics.py
CHANGED
(hunks reconstructed from the rendered diff; removed lines whose original text was not preserved in the rendering are omitted)

@@ -1,17 +1,18 @@
-
-import logging
-logger = logging.getLogger(__name__)
-
 from dataclasses import dataclass, field
 from typing import Dict, List
 
+from carrottransform.tools.logger import logger_setup
+
+logger = logger_setup()
+
+
 @dataclass
 class DataKey:
     source: str
-    fieldname:str
-    tablename:str
-    concept_id:str
-    additional:str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
 
     def __str__(self) -> str:
         """
@@ -19,11 +20,21 @@ class DataKey:
         This is here in case that representation is needed somewhere
         """
         return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+
     def __hash__(self) -> int:
         """
         The DataKey is used as a key for a dictionary of key counts
         """
-        return hash(
+        return hash(
+            (
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+            )
+        )
+
 
 @dataclass
 class CountData:
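The expanded `__hash__` above hashes the same field tuple that the dataclass-generated `__eq__` compares, which is what lets `DataKey` serve as a dictionary key: a plain `@dataclass` keeps an explicitly defined `__hash__` rather than replacing it. A minimal sketch of the pattern, with illustrative names rather than the package's own:

```python
from dataclasses import dataclass


@dataclass
class Key:
    source: str
    fieldname: str

    def __hash__(self) -> int:
        # hash the same tuple of fields the generated __eq__ compares,
        # so equal keys always land in the same dict bucket
        return hash((self.source, self.fieldname))


counts = {}
counts[Key("demo.csv", "dob")] = 1
counts[Key("demo.csv", "dob")] += 1  # equal fields -> same entry
print(counts[Key("demo.csv", "dob")])  # 2
```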
@@ -33,13 +44,15 @@ class CountData:
         if count_type not in self.counts:
             self.counts[count_type] = 0
         self.counts[count_type] += 1
-
-    def get_count(self, count_type: str, default: int=0):
+
+    def get_count(self, count_type: str, default: int = 0):
         return self.counts.get(count_type, default)
 
+
 @dataclass
 class MapstreamSummaryRow:
     """Represents a single row in the mapstream summary"""
+
     dataset_name: str
     source: str
     fieldname: str
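As a side note on the `CountData` pattern above: zero-initialising a missing count before incrementing, plus reading with a default, is exactly the behaviour `collections.Counter` provides out of the box. A short sketch, not a suggested change to the package:

```python
from collections import Counter

counts = Counter()
counts["input_count"] += 1  # missing keys start at 0, no explicit init
counts["input_count"] += 1
print(counts["input_count"])          # 2
print(counts.get("output_count", 0))  # 0, like CountData.get_count
```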
@@ -51,57 +64,62 @@ class MapstreamSummaryRow:
     invalid_date_fields: int = 0
     invalid_source_fields: int = 0
     output_count: int = 0
-
+
     def to_tsv_row(self) -> str:
         """Convert the row to a tab-separated string"""
-        row_list = [
+        row_list = [
+            str(col)
+            for col in [
+                self.dataset_name,
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+                self.input_count,
+                self.invalid_person_ids,
+                self.invalid_date_fields,
+                self.invalid_source_fields,
+                self.output_count,
+            ]
+        ]
         # If python gets updated, you can move the row_str expression into the f-string
-        row_str =
+        row_str = "\t".join(row_list)
         return f"{row_str}\n"
-
+
     @classmethod
     def get_header(cls) -> str:
         """Return the TSV header row"""
         header = [
-        header_str =
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount",
+        ]
+        header_str = "\t".join(header)
         return f"{header_str}\n"
 
-class Metrics():
+
+class Metrics:
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
     The main principle is to increment counts associated with datakeys (dkey) at different levels
     """
+
     def __init__(self, dataset_name, log_threshold=0):
         """
         self.datasummary holds all the saved counts
         """
-        self.datasummary={}
-        self.allcounts={}
-        self.dataset_name=dataset_name
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):
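Both reformatted methods build a line the same way: stringify the columns, join with tabs, append a newline. A quick illustration with invented values:

```python
header = ["dsname", "source", "source_field", "target"]
row = ["ExampleDataset", "demo.csv", "dob", "person"]  # hypothetical values

# same join-then-newline construction as get_header()/to_tsv_row()
header_str = "\t".join(header)
row_str = "\t".join(str(col) for col in row)
print(f"{header_str}\n{row_str}\n", end="")
```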
@@ -135,8 +153,18 @@ class Metrics():
             prfx = "NA"
             if "source_files" in increment:
                 if fieldname in increment["source_files"]:
-                    prfx = self.get_prefix(
+                    prfx = self.get_prefix(
+                        increment["source_files"][fieldname]["table"]
+                    )
+                dkey = (
+                    prfx
+                    + "."
+                    + desttablename
+                    + "."
+                    + name
+                    + "."
+                    + fieldname
+                )
                 self.add_counts_to_summary(dkey, dataitem[fieldname])
 
     def get_prefix(self, fname):
@@ -150,7 +178,9 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])
 
-    def increment_key_count(
+    def increment_key_count(
+        self, source, fieldname, tablename, concept_id, additional, count_type
+    ):
         dkey = DataKey(source, fieldname, tablename, concept_id, additional)
 
         if dkey not in self.datasummary:
@@ -159,87 +189,111 @@ class Metrics():
         self.datasummary[dkey].increment(count_type)
 
     def increment_with_datacol(
-        #Are the parameters for DataKeys hierarchical?
-        #If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
-        #But I don't know if that's the desired behaviour
-
-        #A lot of these increment the same thing, so I have defined `increment_this`
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        # But I don't know if that's the desired behaviour
+
+        # A lot of these increment the same thing, so I have defined `increment_this`
         def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
             self.increment_key_count(
-                source=source_path,
-                fieldname=fieldname,
-                tablename=target_file,
-                concept_id=concept_id,
-                additional=additional,
-                count_type="output_count"
-            )
-            self.increment_key_count(
                 source=source_path,
-                fieldname=
-                tablename=
-                concept_id=
-                additional=
-                count_type="output_count"
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count",
+            )
 
         self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
         increment_this(fieldname="all", concept_id="all")
-
+
         if target_file == "person":
             increment_this(fieldname="all", concept_id=out_record[1])
-            increment_this(
+            increment_this(
+                fieldname="all", concept_id=out_record[1], additional=out_record[2]
+            )
         else:
             increment_this(fieldname=datacol, concept_id=out_record[2])
             increment_this(fieldname="all", concept_id=out_record[2])
             self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
             self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
 
     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
 
         for dkey in self.datasummary:
             logger.debug(dkey)
-            source, tablename, name, colname = dkey.split(
+            source, tablename, name, colname = dkey.split(".")
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
             after_pct = (float)(before_count - after_count) * 100 / before_count
-            summary_str +=
+            summary_str += (
+                source
+                + "\t"
+                + tablename
+                + "\t"
+                + name
+                + "\t"
+                + colname
+                + "\t"
+                + str(before_count)
+                + "\t"
+                + str(after_count)
+                + "\t"
+                + "{0:.3f}".format(after_pct)
+                + "\t"
+            )
             if "after_formatting" in self.datasummary[dkey]:
                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
-                after_format_pct = (
+                after_format_pct = (
+                    (float)(after_count - after_format_count) * 100 / after_count
+                )
+                summary_str += (
+                    str(after_format_count)
+                    + "\t"
+                    + "{0:.3f}".format(after_format_pct)
+                    + "\n"
+                )
             else:
                 summary_str += "NA\tNA\n"
 
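`increment_with_datacol` bumps `output_count` not only for the exact (source, field, table, concept) key but also for several "all" wildcard keys, so totals per source, per table, and overall can be read back later without re-aggregating. A reduced sketch of that roll-up idea (illustrative code, not the package API):

```python
from collections import Counter

counts = Counter()

def record(source: str, table: str, concept: str) -> None:
    # bump the exact key plus coarser "all" roll-ups, mirroring the
    # wildcard DataKeys that increment_with_datacol maintains
    for key in [
        (source, table, concept),  # exact
        (source, "all", "all"),    # per-source total
        ("all", table, "all"),     # per-table total
        ("all", "all", "all"),     # grand total
    ]:
        counts[key] += 1

record("demo.csv", "person", "8507")
record("demo.csv", "observation", "4275495")
print(counts[("demo.csv", "all", "all")])  # 2
print(counts[("all", "person", "all")])    # 1
```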
@@ -269,24 +323,23 @@ class Metrics():
                 invalid_person_ids=count_data.get_count("invalid_person_ids"),
                 invalid_date_fields=count_data.get_count("invalid_date_fields"),
                 invalid_source_fields=count_data.get_count("invalid_source_fields"),
-                output_count=count_data.get_count("output_count")
+                output_count=count_data.get_count("output_count"),
             )
 
             if row.output_count >= self.log_threshold:
                 rows.append(row)
         return rows
 
-
     def get_mapstream_summary(self) -> str:
         """
         Makes a TSV string of the mapstream summary
         """
         summary_rows = self.get_mapstream_summary_rows()
         result = MapstreamSummaryRow.get_header()
-
+
         for row in summary_rows:
             result += row.to_tsv_row()
-
+
         return result
 
     def get_mapstream_summary_dict(self) -> Dict:
@@ -297,5 +350,5 @@ class Metrics():
         return {
             "dataset": self.dataset_name,
             "threshold": self.log_threshold,
-            "rows": [vars(row) for row in rows]
+            "rows": [vars(row) for row in rows],
         }
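One reading note on `get_summary` above: `(float)(...)` is not a cast, just the `float` builtin called through parentheses, so the percentage line is an ordinary float computation. Isolated here for clarity:

```python
def reject_pct(before: int, after: int) -> float:
    # "(float)(before - after) * 100 / before" in the source is this
    return float(before - after) * 100 / before

print("{0:.3f}".format(reject_pct(200, 187)))  # 6.500
```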
carrottransform/tools/omopcdm.py
CHANGED
(hunks reconstructed from the rendered diff; removed lines whose original text was not preserved in the rendering are omitted)

@@ -1,18 +1,19 @@
 import carrottransform.tools as tools
 import json
-import
+from carrottransform.tools.logger import logger_setup
 import re
 import sys
 
 from pathlib import Path
 
-logger =
+logger = logger_setup()
+
 
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
     Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
-    Define a series of "get" functions to allow CDM component discovery
+    Define a series of "get" functions to allow CDM component discovery
     """
 
     def __init__(self, omopddl, omopcfg):
@@ -33,15 +34,14 @@ class OmopCDM:
         self.person_id_field = self.get_columns("person_id_field")
         self.auto_number_field = self.get_columns("auto_number_field")
 
-
     def load_ddl(self, omopddl: Path):
         try:
-            fp = omopddl.open("r")
-        except Exception
+            fp = omopddl.open("r")
+        except Exception:
             logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
-
-        return
+
+        return self.process_ddl(fp)
 
     def process_ddl(self, fp):
         """
@@ -56,13 +56,13 @@ class OmopCDM:
         output_dict["date_fields"] = {}
 
         ## matching for version number - matches '--postgres', any number of chars and some digits of the form X.Y, plus an end of string or end of line
-        ver_rgx = re.compile(r
+        ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
         ## matching for table name - matches 'CREATE TABLE @', some letters (upper and lower case), '.' and some more letters (lower case)
-        start_rgx = re.compile(r
+        start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
         ## matches some whitespace, lower case letters(or underscores), whitespace, letters (upper/lower and underscores)
-        datatype_rgx = re.compile(r
+        datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
         ## matching for end of file - matches close bracket, semi colon, end of file or line
-        end_rgx = re.compile(r
+        end_rgx = re.compile(r".*[)];$")
         vermatched = False
         processing_table_data = False
         tabname = ""
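The restored patterns can be checked against representative DDL lines; the sample lines below are made up for illustration, but the regexes are the ones from the hunk:

```python
import re

# the three line-classifying patterns from process_ddl()
ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")

print(ver_rgx.search("--postgresql OMOP CDM ddl 5.4").group(1))  # 5.4
print(start_rgx.search("CREATE TABLE @cdmDatabaseSchema.person (").group(2))  # person
m = datatype_rgx.search("  person_id integer NOT NULL,")
print(m.group(1), m.group(2))  # person_id integer
```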
@@ -70,21 +70,22 @@ class OmopCDM:
         for line in fp:
             line = line.strip()
             # check for line with version, if present
-            if vermatched
+            if not vermatched:
                 vmatch = ver_rgx.search(line)
-                if vmatch
+                if vmatch is not None:
                     version_string = vmatch.group(1)
                     output_dict["omop_version"] = version_string
                     vermatched = True
+
             # check for start of table definition
-            if processing_table_data
+            if not processing_table_data:
                 smatch = start_rgx.search(line)
-                if smatch
+                if smatch is not None:
                     processing_table_data = True
                     tabname = smatch.group(2).lower()
             else:
                 idtmatch = datatype_rgx.search(line)
-                if idtmatch
+                if idtmatch is not None:
                     fname = idtmatch.group(1)
                     ftype = idtmatch.group(2)
 
@@ -99,12 +100,16 @@ class OmopCDM:
                         output_dict["datetime_fields"][tabname] = []
                     if tabname not in output_dict["date_fields"]:
                         output_dict["date_fields"][tabname] = []
-
+
                     # Add in required column / field data
                     output_dict["all_columns"][tabname].append(fname)
                     if ftype.lower() in self.numeric_types:
                         output_dict["numeric_fields"][tabname].append(fname)
-                    if
+                    if (
+                        ftype.lower() in self.numeric_types
+                        and "NOT" in line
+                        and "NULL" in line
+                    ):
                         output_dict["notnull_numeric_fields"][tabname].append(fname)
                     if ftype.lower() in self.datetime_types:
                         output_dict["datetime_fields"][tabname].append(fname)
@@ -112,19 +117,19 @@ class OmopCDM:
                         output_dict["date_fields"][tabname].append(fname)
 
                 ematch = end_rgx.search(line)
-                if ematch
+                if ematch is not None:
                     processing_table_data = False
-
-        return
-
+
+        return output_dict
+
     def dump_ddl(self):
-        return
+        return json.dumps(self.omop_json, indent=2)
 
     def merge_json(self, omopjson, omopcfg):
         tmp_json = tools.load_json(omopcfg)
         for key, data in tmp_json.items():
             omopjson[key] = data
-        return
+        return omopjson
 
     def get_columns(self, colkey):
         if colkey in self.omop_json:
@@ -157,43 +162,43 @@ class OmopCDM:
             return True
 
     def get_omop_numeric_fields(self, tablename):
-        if self.numeric_fields
+        if self.numeric_fields is not None:
             if tablename in self.numeric_fields:
                 return self.numeric_fields[tablename]
         return []
 
     def get_omop_notnull_numeric_fields(self, tablename):
-        if self.notnull_numeric_fields
+        if self.notnull_numeric_fields is not None:
             if tablename in self.notnull_numeric_fields:
                 return self.notnull_numeric_fields[tablename]
         return []
 
     def get_omop_datetime_linked_fields(self, tablename):
-        if self.datetime_linked_fields
+        if self.datetime_linked_fields is not None:
             if tablename in self.datetime_linked_fields:
                 return self.datetime_linked_fields[tablename]
         return {}
 
     def get_omop_date_field_components(self, tablename):
-        if self.date_field_components
+        if self.date_field_components is not None:
             if tablename in self.date_field_components:
                 return self.date_field_components[tablename]
         return {}
 
     def get_omop_datetime_fields(self, tablename):
-        if self.datetime_fields
+        if self.datetime_fields is not None:
             if tablename in self.datetime_fields:
                 return self.datetime_fields[tablename]
         return []
 
     def get_omop_person_id_field(self, tablename):
-        if self.person_id_field
+        if self.person_id_field is not None:
             if tablename in self.person_id_field:
                 return self.person_id_field[tablename]
         return None
 
     def get_omop_auto_number_field(self, tablename):
-        if self.auto_number_field
+        if self.auto_number_field is not None:
             if tablename in self.auto_number_field:
                 return self.auto_number_field[tablename]
         return None
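All seven getters follow the same guard: check that the mapping was loaded (`is not None`), then fall back to an empty container or `None` for unknown tables. The shared shape, reduced to a hypothetical helper (not part of the package):

```python
def lookup(mapping, tablename, default):
    # mirrors the getters: guard against an unloaded mapping, then
    # fall back to the caller's default for unknown tables
    if mapping is not None:
        if tablename in mapping:
            return mapping[tablename]
    return default

print(lookup({"person": ["year_of_birth"]}, "person", []))  # ['year_of_birth']
print(lookup(None, "person", []))                           # []
```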