carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic.
- carrot_transform-0.3.5.dist-info/METADATA +106 -0
- carrot_transform-0.3.5.dist-info/RECORD +25 -0
- {carrot_transform-0.3.3.dist-info → carrot_transform-0.3.5.dist-info}/WHEEL +1 -1
- carrot_transform-0.3.5.dist-info/entry_points.txt +3 -0
- carrottransform/_version.py +6 -2
- carrottransform/cli/subcommands/run.py +445 -193
- carrottransform/examples/test/inputs/Covid19_test.csv +801 -0
- carrottransform/examples/test/inputs/Demographics.csv +1001 -0
- carrottransform/examples/test/inputs/Symptoms.csv +801 -0
- carrottransform/examples/test/inputs/covid19_antibody.csv +1001 -0
- carrottransform/examples/test/inputs/vaccine.csv +501 -0
- carrottransform/examples/test/rules/rules_14June2021.json +300 -0
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/file_helpers.py +30 -4
- carrottransform/tools/mappingrules.py +13 -10
- carrottransform/tools/metrics.py +212 -40
- carrottransform/tools/omopcdm.py +17 -5
- carrot_transform-0.3.3.dist-info/METADATA +0 -48
- carrot_transform-0.3.3.dist-info/RECORD +0 -17
- {carrot_transform-0.3.3.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE +0 -0
--- carrottransform/cli/subcommands/run.py (0.3.3)
+++ carrottransform/cli/subcommands/run.py (0.3.5)
@@ -1,42 +1,65 @@
+import carrottransform
+import carrottransform.tools as tools
+import click
 import csv
-import os, time
 import datetime
 import fnmatch
-import sys
-import click
-import json
 import importlib.resources
-import
-import
+import json
+import logging
+import os
+import sys
+import time
+
+from carrottransform.tools.click import PathArgs
+from carrottransform.tools.omopcdm import OmopCDM
+from pathlib import Path
+
+from typing import Iterator, IO, Iterable
+from ...tools.file_helpers import resolve_paths
+
+logger = logging.getLogger(__name__)
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)
+
+    logger.addHandler(console_handler)

 @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
 def run():
     pass

+
 @click.command()
-@click.option("--rules-file",
+@click.option("--rules-file", type=PathArgs,
               required=True,
               help="json file containing mapping rules")
-@click.option("--output-dir",
+@click.option("--output-dir", type=PathArgs,
               default=None,
+              required=True,
               help="define the output directory for OMOP-format tsv files")
 @click.option("--write-mode",
               default='w',
               type=click.Choice(['w','a']),
               help="force write-mode on output files")
-@click.option("--person-file",
+@click.option("--person-file", type=PathArgs,
               required=True,
               help="File containing person_ids in the first column")
-@click.option("--omop-ddl-file",
+@click.option("--omop-ddl-file", type=PathArgs,
               required=False,
               help="File containing OHDSI ddl statements for OMOP tables")
-@click.option("--omop-config-file",
+@click.option("--omop-config-file", type=PathArgs,
               required=False,
               help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
               required=False,
-              help="Quoted string containing
-@click.option("--saved-person-id-file",
+              help="Quoted string containing omop version - eg '5.3'")
+@click.option("--saved-person-id-file", type=PathArgs,
               default=None,
               required=False,
               help="Full path to person id file used to save person_id state and share person_ids between data sets")
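
Note: the repeated type=PathArgs in the options above comes from the new carrottransform/tools/click.py helper (+21 lines), which is not shown in this diff. Its exact definition is not visible here; a minimal stand-in with the same apparent effect - converting option strings into pathlib.Path objects - could look like this (illustrative assumption only, not the released implementation):

    import pathlib
    import click

    # Option 1: reuse click's built-in Path type (click >= 8.0 accepts path_type)
    PathArgs = click.Path(path_type=pathlib.Path)

    # Option 2: a tiny custom ParamType with the same behaviour
    class _PathParam(click.ParamType):
        name = "path"

        def convert(self, value, param, ctx):
            # turn the raw command-line string into a pathlib.Path
            return pathlib.Path(value)

    # PathArgs = _PathParam()
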
@@ -44,7 +67,7 @@ def run():
               required=False,
               default='N',
               help="Use person ids as input without generating new integers")
-@click.option("--last-used-ids-file",
+@click.option("--last-used-ids-file", type=PathArgs,
               default=None,
               required=False,
               help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
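
For illustration, a file passed via --last-used-ids-file in the tab-separated format described above might contain something like the following (table names and counts are hypothetical):

    person	1500
    observation	20431
    measurement	18977

mapstream loads these values into record_numbers before the main processing loop via load_last_used_ids, shown later in this diff.
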
@@ -52,124 +75,180 @@ def run():
               required=False,
               default=0,
               help="Lower outcount limit for logfile output")
-@click.
-
-
-
-
-
-
+@click.option("--input-dir", type=PathArgs,
+              required=True,
+              multiple=True,
+              help="Input directories")
+def mapstream(
+    rules_file: Path,
+    output_dir: Path,
+    write_mode,
+    person_file: Path,
+    omop_ddl_file: Path,
+    omop_config_file: Path,
+    omop_version,
+    saved_person_id_file: Path,
+    use_input_person_ids,
+    last_used_ids_file: Path,
+    log_file_threshold,
+    input_dir: Iterable[Path],
+):
     """
     Map to output using input streams
     """
-
+
+
+    # Resolve any @package paths in the arguments
+    resolved_paths = resolve_paths([
+        rules_file,
+        output_dir,
+        person_file,
+        omop_ddl_file,
+        omop_config_file,
+        saved_person_id_file,
+        last_used_ids_file,
+        input_dir[0] if input_dir else None  # Take first element of input_dir tuple
+    ])
+
+    # Assign back resolved paths
+    [rules_file, output_dir, person_file, omop_ddl_file,
+     omop_config_file, saved_person_id_file, last_used_ids_file,
+     input_dir] = resolved_paths
+
+    # Ensure input_dir is a list of paths
+    if isinstance(input_dir, (Path, str)):
+        input_dir = [input_dir]
+    elif isinstance(input_dir, tuple):
+        input_dir = list(input_dir)
+    # If it's already a list, leave it as is
+
+    # Initialisation
     # - check for values in optional arguments
     # - read in configuration files
     # - check main directories for existence
-    # - handle saved
-    # - initialise metrics
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # - handle saved person ids
+    # - initialise metrics
+    logger.info(
+        ",".join(
+            map(
+                str,
+                [
+                    rules_file,
+                    output_dir,
+                    write_mode,
+                    person_file,
+                    omop_ddl_file,
+                    omop_config_file,
+                    omop_version,
+                    saved_person_id_file,
+                    use_input_person_ids,
+                    last_used_ids_file,
+                    log_file_threshold,
+                    input_dir,
+                ],
+            )
+        )
+    )
+
+    ## set omop filenames
+    omop_config_file, omop_ddl_file = set_omop_filenames(
+        omop_ddl_file, omop_config_file, omop_version
+    )
+    ## check directories are valid
+    for idir in input_dir:
+        check_dir_isvalid(idir)  # Input directory must exist
+    check_dir_isvalid(output_dir, create_if_missing=True)  # Create output directory if needed
+
+
+    saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
+
+    start_time = time.time()
+    ## create OmopCDM object, which contains attributes and methods for the omop data tables.
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+
+    ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
-    nowtime = time.time()

-
-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+    logger.info(
+        f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
+    )
+
     output_files = mappingrules.get_all_outfile_names()
+
+    ## set record number
+    ## will keep track of the current record number in each file, e.g., measurement_id, observation_id.
     record_numbers = {}
     for output_file in output_files:
         record_numbers[output_file] = 1
+    if (last_used_ids_file is not None) and last_used_ids_file.is_file():
+        record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)

     fhd = {}
     tgtcolmaps = {}

     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
-        fhpout.close()
-        # Initialise output files, output a header for each
+        ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
+        person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+                                                               person_file, mappingrules,
+                                                               use_input_person_ids)
+        ## open person_ids output file
+        with saved_person_id_file.open(mode="w") as fhpout:
+            ## write the header to the file
+            fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+            ##iterate through the ids and write them to the file.
+            for person_id, person_assigned_id in person_lookup.items():
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")
+
+        ## Initialise output files (adding them to a dict), output a header for each
+        ## these aren't being closed deliberately
         for tgtfile in output_files:
-            fhd[tgtfile] =
-            if write_mode ==
+            fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+            if write_mode == "w":
                 outhdr = omopcdm.get_omop_column_list(tgtfile)
                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
+            ## maps all omop columns for each file into a dict containing the column name and the index
+            ## so tgtcolmaps is a dict of dicts.
            tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

     except IOError as e:
-
+        logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
         exit()

-
+    logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")

-
-    existing_input_files =
+    ## Compare files found in the input_dir with those expected based on mapping rules
+    existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
     rules_input_files = mappingrules.get_all_infile_names()
-    # Log mismatches but continue
-    for infile in existing_input_files:
-        if infile not in rules_input_files:
-            msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
-            print(msg)
-    for infile in rules_input_files:
-        if infile not in existing_input_files:
-            msg = "ERROR: no data for mapped input file - {0}".format(infile)
-            print(msg)

-
+    ## Log mismatches but continue
+    check_files_in_rules_exist(rules_input_files, existing_input_files)
+
+    ## set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-
+    logger.info(rules_input_files)

-
+    ## set up per-input counts
     for srcfilename in rules_input_files:
         rejidcounts[srcfilename] = 0
         rejdatecounts[srcfilename] = 0

-
+    ## main processing loop, for each input file
     for srcfilename in rules_input_files:
         outcounts = {}
         rejcounts = {}
         rcount = 0

-
-
-            csvr = csv.reader(fh)
-        except IOError as e:
-            print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
-            print("I/O error({0}): {1}".format(e.errno, e.strerror))
+        fh, csvr = open_file(input_dir[0] / srcfilename)
+        if fh is None:
             continue

+
+        ## create dict for input file, giving the data and output file
         tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
         infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
         for tgtfile in tgtfiles:
@@ -183,20 +262,37 @@ def mapstream(rules_file, output_dir, write_mode,
         inputcolmap = omopcdm.get_column_map(hdrdata)
         pers_id_col = inputcolmap[infile_person_id_source]
         datetime_col = inputcolmap[infile_datetime_source]
-
-
-
+
+        logger.info(
+            "--------------------------------------------------------------------------------"
+        )
+        logger.info(f"Processing input: {srcfilename}")
+
         # for each input record
         for indata in csvr:
-
-
+            metrics.increment_key_count(
+                source=srcfilename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_count"
+            )
             rcount += 1
+            # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
             strdate = indata[datetime_col].split(" ")[0]
             fulldate = parse_date(strdate)
-            if fulldate
+            if fulldate is not None:
                 indata[datetime_col] = fulldate
             else:
-                metrics.increment_key_count(
+                metrics.increment_key_count(
+                    source=srcfilename,
+                    fieldname="all",
+                    tablename="all",
+                    concept_id="all",
+                    additional="",
+                    count_type="input_date_fields"
+                )
                 continue

            for tgtfile in tgtfiles:
@@ -210,63 +306,71 @@ def mapstream(rules_file, output_dir, write_mode,

                for datacol in datacols:
                    built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
-                    if built_records
+                    if built_records:
                        for outrecord in outrecords:
-                            if auto_num_col
+                            if auto_num_col is not None:
                                outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                            ### most of the rest of this section is actually to do with metrics
                            record_numbers[tgtfile] += 1
                            if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
                                outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
                                outcounts[tgtfile] += 1
-
-                                metrics.
-
-
-
-
-
-
-
-                                    key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
-                                    metrics.increment_key_count(key, "output_count")
-                                else:
-                                    key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[2] + "~"
-                                    metrics.increment_key_count(key, "output_count")
-                                    key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
-                                    metrics.increment_key_count(key, "output_count")
-                                    key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
-                                    metrics.increment_key_count(key, "output_count")
-                                    key = "all~all~all~" + outrecord[2] + "~"
-                                    metrics.increment_key_count(key, "output_count")
+
+                                metrics.increment_with_datacol(
+                                    source_path=srcfilename,
+                                    target_file=tgtfile,
+                                    datacol=datacol,
+                                    out_record=outrecord
+                                )
+
+                                # write the line to the file
                                fhd[tgtfile].write("\t".join(outrecord) + "\n")
                            else:
-
-
+                                metrics.increment_key_count(
+                                    source=srcfilename,
+                                    fieldname="all",
+                                    tablename=tgtfile,
+                                    concept_id="all",
+                                    additional="",
+                                    count_type="invalid_person_ids",
+                                )
                                rejidcounts[srcfilename] += 1

        fh.close()

-
-        print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+        logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
        for outtablename, count in outcounts.items():
-
+            logger.info(f"TARGET: {outtablename}: output count {str(count)}")
    # END main processing loop

-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+
    data_summary = metrics.get_mapstream_summary()
    try:
-        dsfh =
+        dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
        dsfh.write(data_summary)
        dsfh.close()
    except IOError as e:
-
-
+        logger.exception(f"I/O error({e.errno}): {e.strerror}")
+        logger.exception("Unable to write file")
+        raise e

    # END mapstream
-
-
-
-def get_target_records(
+    logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
+
+
+def get_target_records(
+    tgtfilename: str,
+    tgtcolmap: dict[str, dict[str, int]],
+    rulesmap: dict[str, list[dict[str, list[str]]]],
+    srcfield: str,
+    srcdata: list[str],
+    srccolmap: dict[str, int],
+    srcfilename: str,
+    omopcdm: OmopCDM,
+    metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
    """
    build all target records for a given input field
    """
@@ -276,9 +380,10 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
    date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
    notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)

-    srckey = srcfilename
-    summarykey =
+    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+    summarykey = srckey + "~all~"
    if valid_value(str(srcdata[srccolmap[srcfield]])):
+        ## check if either or both of the srckey and summarykey are in the rules
        srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
        dictkeys = []
        if srcfullkey in rulesmap:
@@ -287,10 +392,11 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
        if srckey in rulesmap:
            build_records = True
            dictkeys.append(srckey)
-    if build_records
+    if build_records:
        for dictkey in dictkeys:
            for out_data_elem in rulesmap[dictkey]:
                valid_data_elem = True
+                ## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
                tgtarray = ['']*len(tgtcolmap)
                for req_integer in notnull_numeric_fields:
                    tgtarray[tgtcolmap[req_integer]] = "0"
@@ -302,6 +408,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
                    else:
                        tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
                    if output_col_data in date_component_data:
+                        ## parse the date and store it in the proper format
                        strdate = srcdata[srccolmap[infield]].split(" ")[0]
                        dt = get_datetime_value(strdate)
                        if dt != None:
@@ -314,27 +421,47 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
                            fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                            tgtarray[tgtcolmap[output_col_data]] = fulldate
                        else:
-                            metrics.increment_key_count(
+                            metrics.increment_key_count(
+                                source=srcfilename,
+                                fieldname=srcfield,
+                                tablename=tgtfilename,
+                                concept_id="all",
+                                additional="",
+                                count_type="invalid_date_fields"
+                            )
                            valid_data_elem = False
                    elif output_col_data in date_col_data:
                        fulldate = srcdata[srccolmap[infield]]
                        tgtarray[tgtcolmap[output_col_data]] = fulldate
                        tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                if valid_data_elem
+                if valid_data_elem:
                    tgtrecords.append(tgtarray)
                else:
-                    metrics.increment_key_count(
-
+                    metrics.increment_key_count(
+                        source=srcfilename,
+                        fieldname=srcfield,
+                        tablename=tgtfilename,
+                        concept_id="all",
+                        additional="",
+                        count_type="invalid_source_fields"
+                    )

    return build_records, tgtrecords, metrics

+
def valid_value(item):
    """
    Check if an item is non blank (null)
    """
    if item.strip() == "":
-        return
-    return
+        return False
+    return True
+
+
+# DATE TESTING
+# ------------
+# I started by changing the get_datetime_value to be neater.
+# I think it should be handled all as one thing, but I've spent too much time doing this already

def valid_date_value(item):
    """
@@ -344,44 +471,33 @@ def valid_date_value(item):
    if item.strip() == "":
        return(False)
    if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-
-        return
-    return
+        logger.warning("Bad date : {0}".format(item))
+        return False
+    return True
+

def get_datetime_value(item):
    """
-    Check if a date item is non
-    or
+    Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+    or UK format (DD/MM/YYYY).
+    Returns a datetime object if successful, None otherwise.
    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-        pass
-
-    if dt != None:
-        return(dt)
-
-    # Does the date parse as a UK old-style date?
-    try:
-        dt = datetime.datetime.strptime(item, "%d/%m/%Y")
-    except ValueError:
-        pass
-
-    if dt != None:
-        return(dt)
-
+    date_formats = [
+        "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
+        "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
+        "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
+    ]
+
+    for date_format in date_formats:
+        try:
+            return datetime.datetime.strptime(item, date_format)
+        except ValueError:
+            continue
+
+    # If we get here, none of the formats worked
    return None

+
def parse_date(item):
    """
    Crude hand-coded check on date format
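
The rewritten get_datetime_value above simply walks the date_formats list and returns the first successful strptime result, so its behaviour can be summarised with a few illustrative calls (not part of the diff):

    get_datetime_value("2021-06-14")    # datetime(2021, 6, 14, 0, 0) via "%Y-%m-%d"
    get_datetime_value("14-06-2021")    # datetime(2021, 6, 14, 0, 0) via "%d-%m-%Y"
    get_datetime_value("14/06/2021")    # datetime(2021, 6, 14, 0, 0) via "%d/%m/%Y"
    get_datetime_value("14 June 2021")  # None - no listed format matches
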
@@ -392,9 +508,8 @@ def parse_date(item):
    if len(datedata) != 3:
        return None
    if len(datedata[2]) == 4:
-        return("{
-    return
-
+        return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
+    return "-".join(datedata[:3])

def valid_iso_date(item):
    """
@@ -403,9 +518,10 @@ def valid_iso_date(item):
    try:
        datetime.datetime.strptime(item, "%Y-%m-%d")
    except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

def valid_reverse_iso_date(item):
    """
@@ -414,9 +530,10 @@ def valid_reverse_iso_date(item):
    try:
        datetime.datetime.strptime(item, "%d-%m-%Y")
    except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

def valid_uk_date(item):
    """
@@ -425,12 +542,15 @@ def valid_uk_date(item):
    try:
        datetime.datetime.strptime(item, "%d/%m/%Y")
    except ValueError:
-        return
+        return False
+
+    return True
+

-
+# End of date code

-def load_last_used_ids(last_used_ids_file, last_used_ids):
-    fh = open(
+def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
    csvr = csv.reader(fh, delimiter="\t")

    for last_ids_data in csvr:
@@ -439,8 +559,9 @@ def load_last_used_ids(last_used_ids_file, last_used_ids):
    fh.close()
    return last_used_ids

-
-
+
+def load_saved_person_ids(person_file: Path):
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
    csvr = csv.reader(fh, delimiter="\t")
    last_int = 1
    person_ids = {}
@@ -453,38 +574,47 @@ def load_saved_person_ids(person_file):
    fh.close()
    return person_ids, last_int

-def load_person_ids(
-
+def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
+    person_ids, person_number = get_person_lookup(saved_person_id_file)
+
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
    csvr = csv.reader(fh, delimiter=delim)
    person_columns = {}
    person_col_in_hdr_number = 0
    reject_count = 0

    personhdr = next(csvr)
-
+    logger.info(personhdr)

    # Make a dictionary of column names vs their positions
    for col in personhdr:
        person_columns[col] = person_col_in_hdr_number
        person_col_in_hdr_number += 1

-
-
+    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+        "person"
+    )
+    logger.info(
+        "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
+    )
+
+    ## get the column index of the PersonID from the input file
    person_col = person_columns[person_id_source]

    for persondata in csvr:
-        if not valid_value(persondata[person_columns[person_id_source]]):
+        if not valid_value(persondata[person_columns[person_id_source]]): #just checking that the id is not an empty string
            reject_count += 1
            continue
        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
            reject_count += 1
            continue
-        if persondata[person_col] not in person_ids:
+        if persondata[person_col] not in person_ids: #if not already in person_ids dict, add it
            if use_input_person_ids == "N":
-                person_ids[persondata[person_col]] = str(person_number)
+                person_ids[persondata[person_col]] = str(person_number) #create a new integer person_id
                person_number += 1
            else:
-                person_ids[persondata[person_col]] = str(persondata[person_col])
+                person_ids[persondata[person_col]] = str(persondata[person_col]) #use existing person_id
    fh.close()

    return person_ids, reject_count
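
As a worked example, the saved person-id file that mapstream writes (header SOURCE_SUBJECT/TARGET_SUBJECT, see the with-block earlier in this diff) and that load_saved_person_ids/get_person_lookup read back is a two-column TSV; the subject values below are hypothetical:

    SOURCE_SUBJECT	TARGET_SUBJECT
    PAT-0001	1
    PAT-0002	2
    PAT-0003	3

With --use-input-person-ids left at its default of 'N', TARGET_SUBJECT holds newly assigned integers; with any other value the source identifiers are carried through unchanged.
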
@@ -493,4 +623,126 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
def py():
    pass

+
+def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
+    """Check if directory is valid, optionally create it if missing.
+
+    Args:
+        directory: Directory path as string or tuple
+        create_if_missing: If True, create directory if it doesn't exist
+    """
+
+    ## check directory has been set
+    if directory is None:
+        logger.warning("Directory not provided.")
+        sys.exit(1)
+
+    ## check output dir is valid
+    elif type(directory) is tuple:
+        directory = directory[0]
+
+
+    ## if not a directory, create it if requested (including parents. This option is for the output directory only).
+    if not directory.is_dir():
+        if create_if_missing:
+            try:
+                ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+                directory.mkdir(parents = True)
+                logger.info(f"Created directory: {directory}")
+            except OSError as e:
+                logger.warning(f"Failed to create directory {directory}: {e}")
+                sys.exit(1)
+        else:
+            logger.warning(f"Not a directory, dir {directory}")
+            sys.exit(1)
+
+    # Handle tuple input (like input_dir)
+    if isinstance(directory, tuple):
+        if not directory: # Empty tuple
+            print("No directory provided")
+            sys.exit(1)
+        directory = directory[0]
+
+    # Handle string input
+    dir_path = str(directory)
+    if not os.path.isdir(dir_path):
+        if create_if_missing:
+            try:
+                os.makedirs(dir_path)
+                print(f"Created directory: {dir_path}")
+            except OSError as e:
+                print(f"Failed to create directory {dir_path}: {e}")
+                sys.exit(1)
+        else:
+            print(f"Not a directory, dir {dir_path}")
+            sys.exit(1)
+
+
+def set_saved_person_id_file(
+    saved_person_id_file: Path | None, output_dir: Path
+) -> Path:
+    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
+    if saved_person_id_file is None:
+        saved_person_id_file = output_dir / "person_ids.tsv"
+        if saved_person_id_file.exists():
+            assert not saved_person_id_file.is_dir()
+            saved_person_id_file.unlink()
+    else:
+        assert not saved_person_id_file.is_dir()
+    return saved_person_id_file
+
+def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
+    for infile in existing_input_files:
+        if infile not in rules_input_files:
+            msg = (
+                "WARNING: no mapping rules found for existing input file - {0}".format(
+                    infile
+                )
+            )
+            logger.warning(msg)
+    for infile in rules_input_files:
+        if infile not in existing_input_files:
+            msg = "WARNING: no data for mapped input file - {0}".format(infile)
+            logger.warning(msg)
+
+def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+    """opens a file and does something related to CSVs"""
+    try:
+        fh = file_path.open(mode="r", encoding="utf-8-sig")
+        csvr = csv.reader(fh)
+        return fh, csvr
+    except IOError as e:
+        logger.exception("Unable to open: {0}".format(file_path))
+        logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
+        return None
+
+
+def set_omop_filenames(
+    omop_ddl_file: Path, omop_config_file: Path, omop_version: str
+) -> tuple[Path, Path]:
+    if (
+        (omop_ddl_file is None)
+        and (omop_config_file is None)
+        and (omop_version is not None)
+    ):
+        omop_config_file = (
+            importlib.resources.files("carrottransform") / "config/omop.json"
+        )
+        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+        omop_ddl_file = (
+            importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
+        )
+    return omop_config_file, omop_ddl_file
+
+
+def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
+    # Saved-person-file existence test, reload if found, return last used integer
+    if saved_person_id_file.is_file():
+        person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+    else:
+        person_lookup = {}
+        last_used_integer = 1
+    return person_lookup, last_used_integer
+
run.add_command(mapstream,"mapstream")
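
Putting the new options together, a mapstream run could be invoked roughly as follows. The console-script name is defined by the new entry_points.txt (not shown in this diff), so carrot-transform here is an assumption, and all paths are placeholders:

    carrot-transform run mapstream \
        --rules-file rules/rules.json \
        --person-file inputs/Demographics.csv \
        --input-dir inputs \
        --output-dir output \
        --omop-version 5.3 \
        --write-mode w

--rules-file, --person-file, --input-dir and --output-dir are required in 0.3.5; --omop-version makes set_omop_filenames fall back to the OMOP config and DDL files bundled with the package when no explicit --omop-ddl-file or --omop-config-file is given.
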