carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of carrot-transform might be problematic; see the registry page for details.
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
- carrot_transform-0.4.0.dist-info/RECORD +41 -0
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
- carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
- carrottransform/__init__.py +1 -1
- carrottransform/_version.py +2 -2
- carrottransform/cli/command.py +9 -5
- carrottransform/cli/subcommands/run.py +214 -526
- carrottransform/cli/subcommands/run_v2.py +145 -0
- carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
- carrottransform/examples/test/rules/v1.json +280 -0
- carrottransform/examples/test/rules/v2.json +115 -0
- carrottransform/tools/__init__.py +4 -14
- carrottransform/tools/args.py +128 -0
- carrottransform/tools/concept_helpers.py +61 -0
- carrottransform/tools/core.py +163 -0
- carrottransform/tools/date_helpers.py +79 -0
- carrottransform/tools/file_helpers.py +153 -9
- carrottransform/tools/logger.py +19 -0
- carrottransform/tools/mapping_types.py +32 -0
- carrottransform/tools/mappingrules.py +297 -34
- carrottransform/tools/metrics.py +162 -109
- carrottransform/tools/omopcdm.py +37 -32
- carrottransform/tools/orchestrator.py +381 -0
- carrottransform/tools/person_helpers.py +126 -0
- carrottransform/tools/record_builder.py +413 -0
- carrottransform/tools/stream_helpers.py +71 -0
- carrottransform/tools/types.py +71 -0
- carrottransform/tools/validation.py +62 -0
- carrot_transform-0.3.5.dist-info/RECORD +0 -25
- carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
- {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
```diff
--- a/carrottransform/cli/subcommands/run.py
+++ b/carrottransform/cli/subcommands/run.py
@@ -1,84 +1,102 @@
-import carrottransform
-import carrottransform.tools as tools
-import click
-import csv
-import datetime
-import fnmatch
-import importlib.resources
-import json
-import logging
-import os
 import sys
 import time
-
-from carrottransform.tools.click import PathArgs
-from carrottransform.tools.omopcdm import OmopCDM
 from pathlib import Path
+
+import click
 
-
-from
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+import carrottransform.tools as tools
+from carrottransform.tools.click import PathArgs
+from carrottransform.tools.file_helpers import (
+    check_dir_isvalid,
+    check_files_in_rules_exist,
+    open_file,
+    resolve_paths,
+    set_omop_filenames,
+)
+from carrottransform.tools.logger import logger_setup
+from carrottransform.tools.core import (
+    get_target_records,
+)
+from carrottransform.tools.date_helpers import normalise_to8601
+from carrottransform.tools.person_helpers import (
+    load_last_used_ids,
+    load_person_ids,
+    set_saved_person_id_file,
+)
+from carrottransform.tools.args import person_rules_check, OnlyOnePersonInputAllowed
+
+logger = logger_setup()
 
 
 @click.command()
-@click.option(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-              help="Quoted string containing omop version - eg '5.3'")
-@click.option("--saved-person-id-file", type=PathArgs,
-              default=None,
-              required=False,
-              help="Full path to person id file used to save person_id state and share person_ids between data sets")
-@click.option("--use-input-person-ids",
-              required=False,
-              default='N',
-              help="Use person ids as input without generating new integers")
-@click.option("--last-used-ids-file", type=PathArgs,
-              default=None,
-              required=False,
-              help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
-@click.option("--log-file-threshold",
-              required=False,
-              default=0,
-              help="Lower outcount limit for logfile output")
-@click.option("--input-dir", type=PathArgs,
+@click.option(
+    "--rules-file",
+    type=PathArgs,
+    required=True,
+    help="json file containing mapping rules",
+)
+@click.option(
+    "--output-dir",
+    type=PathArgs,
+    default=None,
+    required=True,
+    help="define the output directory for OMOP-format tsv files",
+)
+@click.option(
+    "--write-mode",
+    default="w",
+    type=click.Choice(["w", "a"]),
+    help="force write-mode on output files",
+)
+@click.option(
+    "--person-file",
+    type=PathArgs,
     required=True,
-
-
+    help="File containing person_ids in the first column",
+)
+@click.option(
+    "--omop-ddl-file",
+    type=PathArgs,
+    required=False,
+    help="File containing OHDSI ddl statements for OMOP tables",
+)
+@click.option(
+    "--omop-config-file",
+    type=PathArgs,
+    required=False,
+    help="File containing additional / override json config for omop outputs",
+)
+@click.option(
+    "--omop-version",
+    required=False,
+    help="Quoted string containing omop version - eg '5.3'",
+)
+@click.option(
+    "--saved-person-id-file",
+    type=PathArgs,
+    default=None,
+    required=False,
+    help="Full path to person id file used to save person_id state and share person_ids between data sets",
+)
+@click.option(
+    "--use-input-person-ids",
+    required=False,
+    default="N",
+    help="Use person ids as input without generating new integers",
+)
+@click.option(
+    "--last-used-ids-file",
+    type=PathArgs,
+    default=None,
+    required=False,
+    help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer",
+)
+@click.option(
+    "--log-file-threshold",
+    required=False,
+    default=0,
+    help="Lower outcount limit for logfile output",
+)
+@click.option("--input-dir", type=PathArgs, required=True, help="Input directories")
 def mapstream(
     rules_file: Path,
     output_dir: Path,
```
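The decorator stack above keeps the same option names but is reflowed into black-formatted multi-line `@click.option(...)` calls, with `--input-dir` collapsing to a single required option. For orientation, here is a minimal sketch of driving the resulting `mapstream` command through click's test runner; every path below is a hypothetical placeholder rather than a file shipped with the package:

```python
# Sketch: exercising mapstream via click's CliRunner.
# The option names come from the decorators above; all paths are placeholders.
from click.testing import CliRunner

from carrottransform.cli.subcommands.run import mapstream

runner = CliRunner()
result = runner.invoke(
    mapstream,
    [
        "--rules-file", "rules/v1.json",       # json mapping rules (required)
        "--person-file", "input/persons.csv",  # person_ids in the first column (required)
        "--input-dir", "input",                # source csv files (required)
        "--output-dir", "out",                 # OMOP-format tsv files land here (required)
        "--omop-version", "5.3",               # picks the bundled ddl/config when neither is given
    ],
)
print(result.exit_code, result.output)
```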
```diff
@@ -91,15 +109,14 @@ def mapstream(
     use_input_person_ids,
     last_used_ids_file: Path,
     log_file_threshold,
-    input_dir:
+    input_dir: Path,
 ):
     """
     Map to output using input streams
     """
 
-
     # Resolve any @package paths in the arguments
-
+    [
         rules_file,
         output_dir,
         person_file,
@@ -107,20 +124,19 @@ def mapstream(
         omop_config_file,
         saved_person_id_file,
         last_used_ids_file,
-        input_dir
-    ]
-
-
-
-
-
-
-
-
-
-
-
-    # If it's already a list, leave it as is
+        input_dir,
+    ] = resolve_paths(
+        [
+            rules_file,
+            output_dir,
+            person_file,
+            omop_ddl_file,
+            omop_config_file,
+            saved_person_id_file,
+            last_used_ids_file,
+            input_dir,
+        ]
+    )
 
     # Initialisation
     # - check for values in optional arguments
@@ -150,18 +166,37 @@ def mapstream(
         )
     )
 
+    # check on the rules file
+    if (rules_file is None) or (not rules_file.is_file()):
+        logger.exception(f"rules file was set to `{rules_file=}` and is missing")
+        sys.exit(-1)
+
     ## set omop filenames
     omop_config_file, omop_ddl_file = set_omop_filenames(
         omop_ddl_file, omop_config_file, omop_version
     )
     ## check directories are valid
-
-
-
-
+    check_dir_isvalid(input_dir)  # Input directory must exist - we need the files in it
+    check_dir_isvalid(
+        output_dir, create_if_missing=True
+    )  # Create output directory if needed
 
     saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
 
+    ## check on the person_file_rules
+    try:
+        person_rules_check(rules_file=rules_file, person_file=person_file)
+    except OnlyOnePersonInputAllowed as e:
+        inputs = list(sorted(list(e._inputs)))
+
+        logger.error(
+            f"Person properties were mapped from ({inputs}) but can only come from the person file {person_file.name=}"
+        )
+        sys.exit(-1)
+    except Exception as e:
+        logger.exception(f"person_file_rules check failed: {e}")
+        sys.exit(-1)
+
     start_time = time.time()
     ## create OmopCDM object, which contains attributes and methods for the omop data tables.
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
```
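`person_rules_check` and `OnlyOnePersonInputAllowed` come from `carrottransform/tools/args.py`, a new module this diff only lists in the summary (+128 lines). Judging solely from the call site and the `except` clauses above (including the `e._inputs` attribute that gets read), the contract could be sketched as below; the rules-json layout and the rule-walking logic are assumptions, not the shipped implementation:

```python
# Sketch of the contract implied by the call site; the real implementation in
# carrottransform/tools/args.py is not shown in this diff.
import json
from pathlib import Path


class OnlyOnePersonInputAllowed(Exception):
    """Person-table properties were mapped from inputs other than the person file."""

    def __init__(self, inputs: set[str]):
        super().__init__(f"person properties mapped from: {inputs}")
        self._inputs = inputs  # the caller reads e._inputs to report offenders


def person_rules_check(rules_file: Path, person_file: Path) -> None:
    # Assumption: the rules json follows the v1 layout, with a "cdm" section
    # keyed by destination table and each rule mapping fields to a source_table.
    rules = json.loads(rules_file.read_text())
    sources = {
        field_def["source_table"]
        for rule in rules.get("cdm", {}).get("person", {}).values()
        for field_def in rule.values()
        if isinstance(field_def, dict) and "source_table" in field_def
    }
    if sources - {person_file.name}:
        raise OnlyOnePersonInputAllowed(sources)
```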
```diff
@@ -192,21 +227,24 @@ def mapstream(
 
     try:
         ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
-        person_lookup, rejected_person_count = load_person_ids(
-
-
+        person_lookup, rejected_person_count = load_person_ids(
+            saved_person_id_file, person_file, mappingrules, use_input_person_ids
+        )
+
         ## open person_ids output file
         with saved_person_id_file.open(mode="w") as fhpout:
             ## write the header to the file
             fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
             ##iterate through the ids and write them to the file.
             for person_id, person_assigned_id in person_lookup.items():
-                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")
 
         ## Initialise output files (adding them to a dict), output a header for each
         ## these aren't being closed deliberately
         for tgtfile in output_files:
-            fhd[tgtfile] = (
+            fhd[tgtfile] = (
+                (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+            )
             if write_mode == "w":
                 outhdr = omopcdm.get_omop_column_list(tgtfile)
                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
@@ -218,10 +256,12 @@ def mapstream(
         logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
         exit()
 
-    logger.info(
+    logger.info(
+        f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
+    )
 
     ## Compare files found in the input_dir with those expected based on mapping rules
-    existing_input_files = [f.name for f in input_dir
+    existing_input_files = [f.name for f in input_dir.glob("*.csv")]
     rules_input_files = mappingrules.get_all_infile_names()
 
     ## Log mismatches but continue
@@ -239,27 +279,31 @@ def mapstream(
 
     ## main processing loop, for each input file
     for srcfilename in rules_input_files:
-        outcounts = {}
-        rejcounts = {}
         rcount = 0
 
-
-        if
-
-
+        fhcsvr = open_file(input_dir / srcfilename)
+        if fhcsvr is None:  # check if it's none before unpacking
+            raise Exception(f"Couldn't find file {srcfilename} in {input_dir}")
+        fh, csvr = fhcsvr  # unpack now because we can't unpack none
 
         ## create dict for input file, giving the data and output file
         tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
-        infile_datetime_source, infile_person_id_source =
+        infile_datetime_source, infile_person_id_source = (
+            mappingrules.get_infile_date_person_id(srcfilename)
+        )
+
+        outcounts = {}
+        rejcounts = {}
         for tgtfile in tgtfiles:
             outcounts[tgtfile] = 0
             rejcounts[tgtfile] = 0
+
         datacolsall = []
-
+        csv_column_headers = next(csvr)
         dflist = mappingrules.get_infile_data_fields(srcfilename)
-        for colname in
+        for colname in csv_column_headers:
            datacolsall.append(colname)
-        inputcolmap = omopcdm.get_column_map(
+        inputcolmap = omopcdm.get_column_map(csv_column_headers)
         pers_id_col = inputcolmap[infile_person_id_source]
         datetime_col = inputcolmap[infile_datetime_source]
 
@@ -271,28 +315,28 @@ def mapstream(
         # for each input record
         for indata in csvr:
             metrics.increment_key_count(
+                source=srcfilename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_count",
+            )
+            rcount += 1
+
+            # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD HH:MM:SS
+            fulldate = normalise_to8601(indata[datetime_col])
+            if fulldate is not None:
+                indata[datetime_col] = fulldate
+            else:
+                metrics.increment_key_count(
                     source=srcfilename,
                     fieldname="all",
                     tablename="all",
                     concept_id="all",
                     additional="",
-                    count_type="
+                    count_type="input_date_fields",
                 )
-            rcount += 1
-            # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
-            strdate = indata[datetime_col].split(" ")[0]
-            fulldate = parse_date(strdate)
-            if fulldate is not None:
-                indata[datetime_col] = fulldate
-            else:
-                metrics.increment_key_count(
-                    source=srcfilename,
-                    fieldname="all",
-                    tablename="all",
-                    concept_id="all",
-                    additional="",
-                    count_type="input_date_fields"
-                )
                 continue
 
             for tgtfile in tgtfiles:
```
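Record-level date handling now goes through `normalise_to8601` from the new `carrottransform/tools/date_helpers.py`, and the updated comment promises a full `YYYY-MM-DD HH:MM:SS` timestamp where the removed `parse_date` path discarded everything after the date. The helper itself is outside this diff; a minimal sketch consistent with the formats the removed `get_datetime_value` accepted (see the deleted block further down) might be:

```python
# Sketch only: the shipped normalise_to8601 lives in
# carrottransform/tools/date_helpers.py, which this diff does not include.
import datetime
from typing import Optional

# First entry assumes an already-normalised timestamp; the rest are the
# formats the removed get_datetime_value() accepted.
_FORMATS = [
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d",
    "%d-%m-%Y",
    "%d/%m/%Y",
]


def normalise_to8601(value: str) -> Optional[str]:
    """Return 'YYYY-MM-DD HH:MM:SS' for a recognised date(-time) string, else None."""
    for fmt in _FORMATS:
        try:
            parsed = datetime.datetime.strptime(value.strip(), fmt)
            return parsed.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return None
```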
```diff
@@ -305,40 +349,61 @@ def mapstream(
                 datacols = dflist[tgtfile]
 
                 for datacol in datacols:
-                    built_records, outrecords, metrics = get_target_records(
+                    built_records, outrecords, metrics = get_target_records(
+                        tgtfile,
+                        tgtcolmap,
+                        src_to_tgt,
+                        datacol,
+                        indata,
+                        inputcolmap,
+                        srcfilename,
+                        omopcdm,
+                        metrics,
+                    )
+
                     if built_records:
                         for outrecord in outrecords:
                             if auto_num_col is not None:
-                                outrecord[tgtcolmap[auto_num_col]] = str(
+                                outrecord[tgtcolmap[auto_num_col]] = str(
+                                    record_numbers[tgtfile]
+                                )
                             ### most of the rest of this section is actually to do with metrics
                             record_numbers[tgtfile] += 1
+
                             if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
-                                outrecord[tgtcolmap[pers_id_col]] = person_lookup[
+                                outrecord[tgtcolmap[pers_id_col]] = person_lookup[
+                                    outrecord[tgtcolmap[pers_id_col]]
+                                ]
                                 outcounts[tgtfile] += 1
 
                                 metrics.increment_with_datacol(
-
-
-
-
-
+                                    source_path=srcfilename,
+                                    target_file=tgtfile,
+                                    datacol=datacol,
+                                    out_record=outrecord,
+                                )
 
                                 # write the line to the file
                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
                             else:
                                 metrics.increment_key_count(
-
-
-
-
-
-
-
+                                    source=srcfilename,
+                                    fieldname="all",
+                                    tablename=tgtfile,
+                                    concept_id="all",
+                                    additional="",
+                                    count_type="invalid_person_ids",
+                                )
                                 rejidcounts[srcfilename] += 1
 
+            if tgtfile == "person":
+                break
+
         fh.close()
 
-        logger.info(
+        logger.info(
+            f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs"
+        )
         for outtablename, count in outcounts.items():
             logger.info(f"TARGET: {outtablename}: output count {str(count)}")
     # END main processing loop
@@ -346,7 +411,7 @@ def mapstream(
     logger.info(
         "--------------------------------------------------------------------------------"
     )
-
+
     data_summary = metrics.get_mapstream_summary()
     try:
         dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
@@ -361,388 +426,11 @@ def mapstream(
     logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
 
 
-
-
-    tgtcolmap: dict[str, dict[str, int]],
-    rulesmap: dict[str, list[dict[str, list[str]]]],
-    srcfield: str,
-    srcdata: list[str],
-    srccolmap: dict[str, int],
-    srcfilename: str,
-    omopcdm: OmopCDM,
-    metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
-    """
-    build all target records for a given input field
-    """
-    build_records = False
-    tgtrecords = []
-    date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
-    date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
-    notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
-
-    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
-    summarykey = srckey + "~all~"
-    if valid_value(str(srcdata[srccolmap[srcfield]])):
-        ## check if either or both of the srckey and summarykey are in the rules
-        srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
-        dictkeys = []
-        if srcfullkey in rulesmap:
-            build_records = True
-            dictkeys.append(srcfullkey)
-        if srckey in rulesmap:
-            build_records = True
-            dictkeys.append(srckey)
-        if build_records:
-            for dictkey in dictkeys:
-                for out_data_elem in rulesmap[dictkey]:
-                    valid_data_elem = True
-                    ## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
-                    tgtarray = ['']*len(tgtcolmap)
-                    for req_integer in notnull_numeric_fields:
-                        tgtarray[tgtcolmap[req_integer]] = "0"
-                    for infield, outfield_list in out_data_elem.items():
-                        for output_col_data in outfield_list:
-                            if "~" in output_col_data:
-                                outcol, term = output_col_data.split("~")
-                                tgtarray[tgtcolmap[outcol]] = term
-                            else:
-                                tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
-                                if output_col_data in date_component_data:
-                                    ## parse the date and store it in the proper format
-                                    strdate = srcdata[srccolmap[infield]].split(" ")[0]
-                                    dt = get_datetime_value(strdate)
-                                    if dt != None:
-                                        year_field = date_component_data[output_col_data]["year"]
-                                        month_field = date_component_data[output_col_data]["month"]
-                                        day_field = date_component_data[output_col_data]["day"]
-                                        tgtarray[tgtcolmap[year_field]] = str(dt.year)
-                                        tgtarray[tgtcolmap[month_field]] = str(dt.month)
-                                        tgtarray[tgtcolmap[day_field]] = str(dt.day)
-                                        fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
-                                        tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                    else:
-                                        metrics.increment_key_count(
-                                            source=srcfilename,
-                                            fieldname=srcfield,
-                                            tablename=tgtfilename,
-                                            concept_id="all",
-                                            additional="",
-                                            count_type="invalid_date_fields"
-                                        )
-                                        valid_data_elem = False
-                                elif output_col_data in date_col_data:
-                                    fulldate = srcdata[srccolmap[infield]]
-                                    tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                    tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                    if valid_data_elem:
-                        tgtrecords.append(tgtarray)
-    else:
-        metrics.increment_key_count(
-            source=srcfilename,
-            fieldname=srcfield,
-            tablename=tgtfilename,
-            concept_id="all",
-            additional="",
-            count_type="invalid_source_fields"
-        )
-
-    return build_records, tgtrecords, metrics
-
-
-def valid_value(item):
-    """
-    Check if an item is non blank (null)
-    """
-    if item.strip() == "":
-        return False
-    return True
-
-
-# DATE TESTING
-# ------------
-# I started by changing the get_datetime_value to be neater.
-# I think it should be handled all as one thing, but I've spent too much time doing this already
-
-def valid_date_value(item):
-    """
-    Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
-    or dd/mm/yyyy or mm/dd/yyyy
-    """
-    if item.strip() == "":
-        return(False)
-    if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-        logger.warning("Bad date : {0}".format(item))
-        return False
-    return True
-
-
-def get_datetime_value(item):
-    """
-    Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
-    or UK format (DD/MM/YYYY).
-    Returns a datetime object if successful, None otherwise.
-    """
-    date_formats = [
-        "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
-        "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
-        "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
-    ]
-
-    for date_format in date_formats:
-        try:
-            return datetime.datetime.strptime(item, date_format)
-        except ValueError:
-            continue
-
-    # If we get here, none of the formats worked
-    return None
-
-
-def parse_date(item):
-    """
-    Crude hand-coded check on date format
-    """
-    datedata = item.split("-")
-    if len(datedata) != 3:
-        datedata = item.split("/")
-    if len(datedata) != 3:
-        return None
-    if len(datedata[2]) == 4:
-        return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
-    return "-".join(datedata[:3])
-
-def valid_iso_date(item):
-    """
-    Check if a date item is non null and parses as ISO (YYYY-MM-DD)
-    """
-    try:
-        datetime.datetime.strptime(item, "%Y-%m-%d")
-    except ValueError:
-        return False
-
-    return True
-
-
-def valid_reverse_iso_date(item):
-    """
-    Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
-    """
-    try:
-        datetime.datetime.strptime(item, "%d-%m-%Y")
-    except ValueError:
-        return False
-
-    return True
-
-
-def valid_uk_date(item):
-    """
-    Check if a date item is non null and parses as UK format (DD/MM/YYYY)
-    """
-    try:
-        datetime.datetime.strptime(item, "%d/%m/%Y")
-    except ValueError:
-        return False
-
-    return True
-
-
-# End of date code
-
-def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
-    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
-    csvr = csv.reader(fh, delimiter="\t")
-
-    for last_ids_data in csvr:
-        last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
-
-    fh.close()
-    return last_used_ids
-
-
-def load_saved_person_ids(person_file: Path):
-    fh = person_file.open(mode="r", encoding="utf-8-sig")
-    csvr = csv.reader(fh, delimiter="\t")
-    last_int = 1
-    person_ids = {}
-
-    next(csvr)
-    for persondata in csvr:
-        person_ids[persondata[0]] = persondata[1]
-        last_int += 1
-
-    fh.close()
-    return person_ids, last_int
-
-def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
-    person_ids, person_number = get_person_lookup(saved_person_id_file)
-
-    fh = person_file.open(mode="r", encoding="utf-8-sig")
-    csvr = csv.reader(fh, delimiter=delim)
-    person_columns = {}
-    person_col_in_hdr_number = 0
-    reject_count = 0
-
-    personhdr = next(csvr)
-    logger.info(personhdr)
-
-    # Make a dictionary of column names vs their positions
-    for col in personhdr:
-        person_columns[col] = person_col_in_hdr_number
-        person_col_in_hdr_number += 1
-
-    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
-    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
-        "person"
-    )
-    logger.info(
-        "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
-    )
-
-    ## get the column index of the PersonID from the input file
-    person_col = person_columns[person_id_source]
-
-    for persondata in csvr:
-        if not valid_value(persondata[person_columns[person_id_source]]): #just checking that the id is not an empty string
-            reject_count += 1
-            continue
-        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
-            reject_count += 1
-            continue
-        if persondata[person_col] not in person_ids: #if not already in person_ids dict, add it
-            if use_input_person_ids == "N":
-                person_ids[persondata[person_col]] = str(person_number) #create a new integer person_id
-                person_number += 1
-            else:
-                person_ids[persondata[person_col]] = str(persondata[person_col]) #use existing person_id
-    fh.close()
-
-    return person_ids, reject_count
-
-@click.group(help="Commands for using python configurations to run the ETL transformation.")
-def py():
+@click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+def run():
     pass
 
 
-
-
-
-    Args:
-        directory: Directory path as string or tuple
-        create_if_missing: If True, create directory if it doesn't exist
-    """
-
-    ## check directory has been set
-    if directory is None:
-        logger.warning("Directory not provided.")
-        sys.exit(1)
-
-    ## check output dir is valid
-    elif type(directory) is tuple:
-        directory = directory[0]
-
-
-    ## if not a directory, create it if requested (including parents. This option is for the output directory only).
-    if not directory.is_dir():
-        if create_if_missing:
-            try:
-                ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
-                directory.mkdir(parents = True)
-                logger.info(f"Created directory: {directory}")
-            except OSError as e:
-                logger.warning(f"Failed to create directory {directory}: {e}")
-                sys.exit(1)
-        else:
-            logger.warning(f"Not a directory, dir {directory}")
-            sys.exit(1)
-
-    # Handle tuple input (like input_dir)
-    if isinstance(directory, tuple):
-        if not directory:  # Empty tuple
-            print("No directory provided")
-            sys.exit(1)
-        directory = directory[0]
-
-    # Handle string input
-    dir_path = str(directory)
-    if not os.path.isdir(dir_path):
-        if create_if_missing:
-            try:
-                os.makedirs(dir_path)
-                print(f"Created directory: {dir_path}")
-            except OSError as e:
-                print(f"Failed to create directory {dir_path}: {e}")
-                sys.exit(1)
-        else:
-            print(f"Not a directory, dir {dir_path}")
-            sys.exit(1)
-
-
-def set_saved_person_id_file(
-    saved_person_id_file: Path | None, output_dir: Path
-) -> Path:
-    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
-
-    if saved_person_id_file is None:
-        saved_person_id_file = output_dir / "person_ids.tsv"
-        if saved_person_id_file.exists():
-            assert not saved_person_id_file.is_dir()
-            saved_person_id_file.unlink()
-    else:
-        assert not saved_person_id_file.is_dir()
-    return saved_person_id_file
-
-def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
-    for infile in existing_input_files:
-        if infile not in rules_input_files:
-            msg = (
-                "WARNING: no mapping rules found for existing input file - {0}".format(
-                    infile
-                )
-            )
-            logger.warning(msg)
-    for infile in rules_input_files:
-        if infile not in existing_input_files:
-            msg = "WARNING: no data for mapped input file - {0}".format(infile)
-            logger.warning(msg)
-
-def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
-    """opens a file and does something related to CSVs"""
-    try:
-        fh = file_path.open(mode="r", encoding="utf-8-sig")
-        csvr = csv.reader(fh)
-        return fh, csvr
-    except IOError as e:
-        logger.exception("Unable to open: {0}".format(file_path))
-        logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
-        return None
-
-
-def set_omop_filenames(
-    omop_ddl_file: Path, omop_config_file: Path, omop_version: str
-) -> tuple[Path, Path]:
-    if (
-        (omop_ddl_file is None)
-        and (omop_config_file is None)
-        and (omop_version is not None)
-    ):
-        omop_config_file = (
-            importlib.resources.files("carrottransform") / "config/omop.json"
-        )
-        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-        omop_ddl_file = (
-            importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
-        )
-    return omop_config_file, omop_ddl_file
-
-
-def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
-    # Saved-person-file existence test, reload if found, return last used integer
-    if saved_person_id_file.is_file():
-        person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
-    else:
-        person_lookup = {}
-        last_used_integer = 1
-    return person_lookup, last_used_integer
-
-
-run.add_command(mapstream,"mapstream")
+run.add_command(mapstream, "mapstream")
+if __name__ == "__main__":
+    run()
```