carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic.
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
- carrot_transform-0.4.0.dist-info/RECORD +41 -0
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
- carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
- carrottransform/__init__.py +1 -1
- carrottransform/_version.py +2 -2
- carrottransform/cli/command.py +9 -5
- carrottransform/cli/subcommands/run.py +302 -443
- carrottransform/cli/subcommands/run_v2.py +145 -0
- carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
- carrottransform/examples/test/rules/v1.json +280 -0
- carrottransform/examples/test/rules/v2.json +115 -0
- carrottransform/tools/__init__.py +4 -14
- carrottransform/tools/args.py +128 -0
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/concept_helpers.py +61 -0
- carrottransform/tools/core.py +163 -0
- carrottransform/tools/date_helpers.py +79 -0
- carrottransform/tools/file_helpers.py +177 -7
- carrottransform/tools/logger.py +19 -0
- carrottransform/tools/mapping_types.py +32 -0
- carrottransform/tools/mappingrules.py +298 -32
- carrottransform/tools/metrics.py +274 -49
- carrottransform/tools/omopcdm.py +42 -32
- carrottransform/tools/orchestrator.py +381 -0
- carrottransform/tools/person_helpers.py +126 -0
- carrottransform/tools/record_builder.py +413 -0
- carrottransform/tools/stream_helpers.py +71 -0
- carrottransform/tools/types.py +71 -0
- carrottransform/tools/validation.py +62 -0
- carrot_transform-0.3.4.dist-info/RECORD +0 -24
- carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/metrics.py
CHANGED
@@ -1,15 +1,125 @@
-
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+from carrottransform.tools.logger import logger_setup
+
+logger = logger_setup()
+
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is here in case that representation is needed somewhere
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts
+        """
+        return hash(
+            (
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+            )
+        )
+
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int = 0):
+        return self.counts.get(count_type, default)
+
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [
+            str(col)
+            for col in [
+                self.dataset_name,
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+                self.input_count,
+                self.invalid_person_ids,
+                self.invalid_date_fields,
+                self.invalid_source_fields,
+                self.output_count,
+            ]
+        ]
+        # If python gets updated, you can move the row_str expression into the f-string
+        row_str = "\t".join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount",
+        ]
+        header_str = "\t".join(header)
+        return f"{header_str}\n"
+
+
+class Metrics:
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
     The main principle is to increment counts associated with datakeys (dkey) at different levels
     """
+
     def __init__(self, dataset_name, log_threshold=0):
         """
         self.datasummary holds all the saved counts
         """
-        self.datasummary={}
-        self.allcounts={}
-        self.dataset_name=dataset_name
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):

@@ -43,8 +153,18 @@ class Metrics():
             prfx = "NA"
             if "source_files" in increment:
                 if fieldname in increment["source_files"]:
-                    prfx = self.get_prefix(
-
+                    prfx = self.get_prefix(
+                        increment["source_files"][fieldname]["table"]
+                    )
+                    dkey = (
+                        prfx
+                        + "."
+                        + desttablename
+                        + "."
+                        + name
+                        + "."
+                        + fieldname
+                    )
                     self.add_counts_to_summary(dkey, dataitem[fieldname])
 
     def get_prefix(self, fname):

@@ -58,30 +178,122 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])
 
-    def increment_key_count(
-
-
-
+    def increment_key_count(
+        self, source, fieldname, tablename, concept_id, additional, count_type
+    ):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] =
-
-
-
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        # But I don't know if that's the desired behaviour
+
+        # A lot of these increment the same thing, so I have defined `increment_this`
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count",
+            )
+
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(
+                fieldname="all", concept_id=out_record[1], additional=out_record[2]
+            )
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
 
     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
 
         for dkey in self.datasummary:
-
-            source, tablename, name, colname = dkey.split(
+            logger.debug(dkey)
+            source, tablename, name, colname = dkey.split(".")
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
             after_pct = (float)(before_count - after_count) * 100 / before_count
-            summary_str +=
+            summary_str += (
+                source
+                + "\t"
+                + tablename
+                + "\t"
+                + name
+                + "\t"
+                + colname
+                + "\t"
+                + str(before_count)
+                + "\t"
+                + str(after_count)
+                + "\t"
+                + "{0:.3f}".format(after_pct)
+                + "\t"
+            )
             if "after_formatting" in self.datasummary[dkey]:
                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
-                after_format_pct = (
-
+                after_format_pct = (
+                    (float)(after_count - after_format_count) * 100 / after_count
+                )
+                summary_str += (
+                    str(after_format_count)
+                    + "\t"
+                    + "{0:.3f}".format(after_format_pct)
+                    + "\n"
+                )
             else:
                 summary_str += "NA\tNA\n"
 

@@ -90,40 +302,53 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary
 
-    def
-
-
-
-
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []
 
-
-
-
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]
 
-
-
-
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count"),
+            )
 
-
-
-
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows
 
-
-
-
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()
 
-
-
-            output_count = str(dvalue["output_count"])
+        for row in summary_rows:
+            result += row.to_tsv_row()
 
-
-            summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+        return result
 
-
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows],
+        }
carrottransform/tools/omopcdm.py
CHANGED
@@ -1,13 +1,19 @@
 import carrottransform.tools as tools
 import json
+from carrottransform.tools.logger import logger_setup
 import re
 import sys
 
+from pathlib import Path
+
+logger = logger_setup()
+
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
     Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
-    Define a series of "get" functions to allow CDM component discovery
+    Define a series of "get" functions to allow CDM component discovery
     """
 
     def __init__(self, omopddl, omopcfg):

@@ -28,15 +34,14 @@ class OmopCDM:
         self.person_id_field = self.get_columns("person_id_field")
         self.auto_number_field = self.get_columns("auto_number_field")
 
-
-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(
-        except Exception
-
+            fp = omopddl.open("r")
+        except Exception:
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
-
-        return
+
+        return self.process_ddl(fp)
 
     def process_ddl(self, fp):
         """

@@ -51,13 +56,13 @@ class OmopCDM:
         output_dict["date_fields"] = {}
 
         ## matching for version number - matches '--postgres', any number of chars and some digits of the form X.Y, plus an end of string or end of line
-        ver_rgx = re.compile(r
+        ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
         ## matching for table name - matches 'CREATE TABLE @', some letters (upper and lower case), '.' and some more letters (lower case)
-        start_rgx = re.compile(r
+        start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
         ## matches some whitespace, lower case letters(or underscores), whitespace, letters (upper/lower and underscores)
-        datatype_rgx = re.compile(r
+        datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
         ## matching for end of file - matches close bracket, semi colon, end of file or line
-        end_rgx = re.compile(r
+        end_rgx = re.compile(r".*[)];$")
         vermatched = False
         processing_table_data = False
         tabname = ""

@@ -65,21 +70,22 @@ class OmopCDM:
         for line in fp:
             line = line.strip()
             # check for line with version, if present
-            if vermatched
+            if not vermatched:
                 vmatch = ver_rgx.search(line)
-                if vmatch
+                if vmatch is not None:
                     version_string = vmatch.group(1)
                     output_dict["omop_version"] = version_string
                     vermatched = True
+
             # check for start of table definition
-            if processing_table_data
+            if not processing_table_data:
                 smatch = start_rgx.search(line)
-                if smatch
+                if smatch is not None:
                     processing_table_data = True
                     tabname = smatch.group(2).lower()
             else:
                 idtmatch = datatype_rgx.search(line)
-                if idtmatch
+                if idtmatch is not None:
                     fname = idtmatch.group(1)
                     ftype = idtmatch.group(2)
 

@@ -94,12 +100,16 @@ class OmopCDM:
                         output_dict["datetime_fields"][tabname] = []
                     if tabname not in output_dict["date_fields"]:
                         output_dict["date_fields"][tabname] = []
-
+
                     # Add in required column / field data
                     output_dict["all_columns"][tabname].append(fname)
                     if ftype.lower() in self.numeric_types:
                         output_dict["numeric_fields"][tabname].append(fname)
-                    if
+                    if (
+                        ftype.lower() in self.numeric_types
+                        and "NOT" in line
+                        and "NULL" in line
+                    ):
                         output_dict["notnull_numeric_fields"][tabname].append(fname)
                     if ftype.lower() in self.datetime_types:
                         output_dict["datetime_fields"][tabname].append(fname)

@@ -107,19 +117,19 @@ class OmopCDM:
                         output_dict["date_fields"][tabname].append(fname)
 
             ematch = end_rgx.search(line)
-            if ematch
+            if ematch is not None:
                 processing_table_data = False
-
-        return
-
+
+        return output_dict
+
     def dump_ddl(self):
-        return
+        return json.dumps(self.omop_json, indent=2)
 
     def merge_json(self, omopjson, omopcfg):
         tmp_json = tools.load_json(omopcfg)
         for key, data in tmp_json.items():
             omopjson[key] = data
-        return
+        return omopjson
 
     def get_columns(self, colkey):
         if colkey in self.omop_json:

@@ -152,43 +162,43 @@ class OmopCDM:
             return True
 
     def get_omop_numeric_fields(self, tablename):
-        if self.numeric_fields
+        if self.numeric_fields is not None:
            if tablename in self.numeric_fields:
                return self.numeric_fields[tablename]
        return []
 
    def get_omop_notnull_numeric_fields(self, tablename):
-        if self.notnull_numeric_fields
+        if self.notnull_numeric_fields is not None:
            if tablename in self.notnull_numeric_fields:
                return self.notnull_numeric_fields[tablename]
        return []
 
    def get_omop_datetime_linked_fields(self, tablename):
-        if self.datetime_linked_fields
+        if self.datetime_linked_fields is not None:
            if tablename in self.datetime_linked_fields:
                return self.datetime_linked_fields[tablename]
        return {}
 
    def get_omop_date_field_components(self, tablename):
-        if self.date_field_components
+        if self.date_field_components is not None:
            if tablename in self.date_field_components:
                return self.date_field_components[tablename]
        return {}
 
    def get_omop_datetime_fields(self, tablename):
-        if self.datetime_fields
+        if self.datetime_fields is not None:
            if tablename in self.datetime_fields:
                return self.datetime_fields[tablename]
        return []
 
    def get_omop_person_id_field(self, tablename):
-        if self.person_id_field
+        if self.person_id_field is not None:
            if tablename in self.person_id_field:
                return self.person_id_field[tablename]
        return None
 
    def get_omop_auto_number_field(self, tablename):
-        if self.auto_number_field
+        if self.auto_number_field is not None:
            if tablename in self.auto_number_field:
                return self.auto_number_field[tablename]
        return None