carrot-transform 0.3.4-py3-none-any.whl → 0.3.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic.
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/METADATA +2 -2
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/RECORD +11 -10
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/WHEEL +1 -1
- carrottransform/cli/subcommands/run.py +361 -190
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/file_helpers.py +30 -4
- carrottransform/tools/mappingrules.py +5 -2
- carrottransform/tools/metrics.py +212 -40
- carrottransform/tools/omopcdm.py +8 -3
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE +0 -0
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/entry_points.txt +0 -0
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: carrot_transform
-Version: 0.3.4
+Version: 0.3.5
 Summary:
 Author: anwarfg
 Author-email: 913028+anwarfg@users.noreply.github.com
@@ -12,8 +12,8 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+Requires-Dist: numpy (<2)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Requires-Dist: pytest (>=8.3.4,<9.0.0)
 Description-Content-Type: text/markdown

 <p align="center">
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/RECORD
CHANGED

@@ -3,7 +3,7 @@ carrottransform/_version.py,sha256=bm7SM-_MN0gstlNsCDO6dAajKcjQD-NxI_xpvfRx0Ts,1
 carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
 carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/subcommands/run.py,sha256=
+carrottransform/cli/subcommands/run.py,sha256=GfRHG_aLoBxuXkpGTTrRmsEcNUjTUB6cl8f1B7lTBt8,28461
 carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
 carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
 carrottransform/examples/test/inputs/Covid19_test.csv,sha256=d5t7Lfhkwbfe3Uk2IBqB2ZT5o0h9QaeraC8E5-IMERo,67521
@@ -13,12 +13,13 @@ carrottransform/examples/test/inputs/covid19_antibody.csv,sha256=SPCpyqpTbVq9987
 carrottransform/examples/test/inputs/vaccine.csv,sha256=_gcM-SIymyt2Dkkr_zGmQI9keIdmDm-gDI_QvXXLFrY,44037
 carrottransform/examples/test/rules/rules_14June2021.json,sha256=n2OYNFhbx-NLhmqjAad6RsfXjQFknZIgQ7a5uyJF0Co,13226
 carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
-carrottransform/tools/
-carrottransform/tools/
-carrottransform/tools/
-carrottransform/tools/
-
-carrot_transform-0.3.
-carrot_transform-0.3.
-carrot_transform-0.3.
-carrot_transform-0.3.
+carrottransform/tools/click.py,sha256=5fxl9zL6piwWMN4cSule0tG90E9g7eFNosoSu1ES1og,471
+carrottransform/tools/file_helpers.py,sha256=_NRswYjqpBBkp4efMBhFf9XIRaqYTw1-jA22usyrbqA,1204
+carrottransform/tools/mappingrules.py,sha256=jvWTLCQoLoCegmLWHPyRSRVOTLejp7LzmFMr-ENmuTU,7121
+carrottransform/tools/metrics.py,sha256=VrcePVGwgHCJqQ1i9Q_KqL6Cv8IbIce2pSRSBth9808,11011
+carrottransform/tools/omopcdm.py,sha256=fcqIub5ud57i-5J3iUvPi2dqfGgyjWnWJTH1djQzq9E,8603
+carrot_transform-0.3.5.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+carrot_transform-0.3.5.dist-info/METADATA,sha256=cW5wfZRrZoai-nnV5k9FVYY8-XGm24Qadu0hYV4P9R8,4206
+carrot_transform-0.3.5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+carrot_transform-0.3.5.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
+carrot_transform-0.3.5.dist-info/RECORD,,
carrottransform/cli/subcommands/run.py
CHANGED

@@ -1,45 +1,65 @@
+import carrottransform
+import carrottransform.tools as tools
+import click
 import csv
-import os, time
 import datetime
 import fnmatch
-import sys
-import click
-import json
 import importlib.resources
-import
-import
+import json
+import logging
+import os
+import sys
+import time
+
+from carrottransform.tools.click import PathArgs
 from carrottransform.tools.omopcdm import OmopCDM
-from
+from pathlib import Path
+
+from typing import Iterator, IO, Iterable
+from ...tools.file_helpers import resolve_paths
+
+logger = logging.getLogger(__name__)
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)

+    logger.addHandler(console_handler)

 @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
 def run():
     pass

+
 @click.command()
-@click.option("--rules-file",
+@click.option("--rules-file", type=PathArgs,
              required=True,
              help="json file containing mapping rules")
-@click.option("--output-dir",
+@click.option("--output-dir", type=PathArgs,
              default=None,
+             required=True,
              help="define the output directory for OMOP-format tsv files")
 @click.option("--write-mode",
              default='w',
              type=click.Choice(['w','a']),
              help="force write-mode on output files")
-@click.option("--person-file",
+@click.option("--person-file", type=PathArgs,
              required=True,
              help="File containing person_ids in the first column")
-@click.option("--omop-ddl-file",
+@click.option("--omop-ddl-file", type=PathArgs,
              required=False,
              help="File containing OHDSI ddl statements for OMOP tables")
-@click.option("--omop-config-file",
+@click.option("--omop-config-file", type=PathArgs,
              required=False,
              help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
              required=False,
              help="Quoted string containing omop version - eg '5.3'")
-@click.option("--saved-person-id-file",
+@click.option("--saved-person-id-file", type=PathArgs,
              default=None,
              required=False,
              help="Full path to person id file used to save person_id state and share person_ids between data sets")
@@ -47,7 +67,7 @@ def run():
              required=False,
              default='N',
              help="Use person ids as input without generating new integers")
-@click.option("--last-used-ids-file",
+@click.option("--last-used-ids-file", type=PathArgs,
              default=None,
              required=False,
              help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
@@ -55,46 +75,108 @@ def run():
              required=False,
              default=0,
              help="Lower outcount limit for logfile output")
-@click.
-
-
-
-
-
-
+@click.option("--input-dir", type=PathArgs,
+             required=True,
+             multiple=True,
+             help="Input directories")
+def mapstream(
+    rules_file: Path,
+    output_dir: Path,
+    write_mode,
+    person_file: Path,
+    omop_ddl_file: Path,
+    omop_config_file: Path,
+    omop_version,
+    saved_person_id_file: Path,
+    use_input_person_ids,
+    last_used_ids_file: Path,
+    log_file_threshold,
+    input_dir: Iterable[Path],
+):
     """
     Map to output using input streams
     """
-
+
+
+    # Resolve any @package paths in the arguments
+    resolved_paths = resolve_paths([
+        rules_file,
+        output_dir,
+        person_file,
+        omop_ddl_file,
+        omop_config_file,
+        saved_person_id_file,
+        last_used_ids_file,
+        input_dir[0] if input_dir else None  # Take first element of input_dir tuple
+    ])
+
+    # Assign back resolved paths
+    [rules_file, output_dir, person_file, omop_ddl_file,
+     omop_config_file, saved_person_id_file, last_used_ids_file,
+     input_dir] = resolved_paths
+
+    # Ensure input_dir is a list of paths
+    if isinstance(input_dir, (Path, str)):
+        input_dir = [input_dir]
+    elif isinstance(input_dir, tuple):
+        input_dir = list(input_dir)
+    # If it's already a list, leave it as is
+
+    # Initialisation
     # - check for values in optional arguments
     # - read in configuration files
     # - check main directories for existence
     # - handle saved person ids
     # - initialise metrics
-
-
-
-
+    logger.info(
+        ",".join(
+            map(
+                str,
+                [
+                    rules_file,
+                    output_dir,
+                    write_mode,
+                    person_file,
+                    omop_ddl_file,
+                    omop_config_file,
+                    omop_version,
+                    saved_person_id_file,
+                    use_input_person_ids,
+                    last_used_ids_file,
+                    log_file_threshold,
+                    input_dir,
+                ],
+            )
+        )
+    )

     ## set omop filenames
-    omop_config_file, omop_ddl_file = set_omop_filenames(
+    omop_config_file, omop_ddl_file = set_omop_filenames(
+        omop_ddl_file, omop_config_file, omop_version
+    )
     ## check directories are valid
-
-
+    for idir in input_dir:
+        check_dir_isvalid(idir)  # Input directory must exist
+    check_dir_isvalid(output_dir, create_if_missing=True)  # Create output directory if needed
+

     saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
-
-
+
+    start_time = time.time()
     ## create OmopCDM object, which contains attributes and methods for the omop data tables.
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)

     ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
-    nowtime = time.time()

-
-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+    logger.info(
+        f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
+    )
+
     output_files = mappingrules.get_all_outfile_names()

     ## set record number
@@ -102,31 +184,30 @@ def mapstream(rules_file, output_dir, write_mode,
     record_numbers = {}
     for output_file in output_files:
         record_numbers[output_file] = 1
-    if last_used_ids_file
-
-        record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+    if (last_used_ids_file is not None) and last_used_ids_file.is_file():
+        record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)

     fhd = {}
     tgtcolmaps = {}

-
-
     try:
         ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
-        person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+        person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+                                                               person_file, mappingrules,
+                                                               use_input_person_ids)
         ## open person_ids output file
-        with open(
+        with saved_person_id_file.open(mode="w") as fhpout:
             ## write the header to the file
             fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
             ##iterate through the ids and write them to the file.
             for person_id, person_assigned_id in person_lookup.items():
-                fhpout.write("{
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")

         ## Initialise output files (adding them to a dict), output a header for each
         ## these aren't being closed deliberately
         for tgtfile in output_files:
-            fhd[tgtfile] =
-            if write_mode ==
+            fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+            if write_mode == "w":
                 outhdr = omopcdm.get_omop_column_list(tgtfile)
                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
             ## maps all omop columns for each file into a dict containing the column name and the index
@@ -134,13 +215,13 @@ def mapstream(rules_file, output_dir, write_mode,
             tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

     except IOError as e:
-
+        logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
         exit()

-
+    logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")

     ## Compare files found in the input_dir with those expected based on mapping rules
-    existing_input_files =
+    existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
     rules_input_files = mappingrules.get_all_infile_names()

     ## Log mismatches but continue
@@ -149,7 +230,7 @@ def mapstream(rules_file, output_dir, write_mode,
     ## set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-
+    logger.info(rules_input_files)

     ## set up per-input counts
     for srcfilename in rules_input_files:
@@ -162,7 +243,7 @@ def mapstream(rules_file, output_dir, write_mode,
         rejcounts = {}
         rcount = 0

-        fh, csvr = open_file(input_dir[0]
+        fh, csvr = open_file(input_dir[0] / srcfilename)
         if fh is None:
             continue

@@ -181,21 +262,37 @@ def mapstream(rules_file, output_dir, write_mode,
         inputcolmap = omopcdm.get_column_map(hdrdata)
         pers_id_col = inputcolmap[infile_person_id_source]
         datetime_col = inputcolmap[infile_datetime_source]
-
-
+
+        logger.info(
+            "--------------------------------------------------------------------------------"
+        )
+        logger.info(f"Processing input: {srcfilename}")

         # for each input record
         for indata in csvr:
-
-
+            metrics.increment_key_count(
+                source=srcfilename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_count"
+            )
             rcount += 1
             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
             strdate = indata[datetime_col].split(" ")[0]
             fulldate = parse_date(strdate)
-            if fulldate
+            if fulldate is not None:
                 indata[datetime_col] = fulldate
             else:
-                metrics.increment_key_count(
+                metrics.increment_key_count(
+                    source=srcfilename,
+                    fieldname="all",
+                    tablename="all",
+                    concept_id="all",
+                    additional="",
+                    count_type="input_date_fields"
+                )
                 continue

             for tgtfile in tgtfiles:
@@ -209,9 +306,9 @@ def mapstream(rules_file, output_dir, write_mode,

                 for datacol in datacols:
                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
-                    if built_records
+                    if built_records:
                         for outrecord in outrecords:
-                            if auto_num_col
+                            if auto_num_col is not None:
                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
                                 ### most of the rest of this section is actually to do with metrics
                                 record_numbers[tgtfile] += 1
@@ -219,70 +316,61 @@ def mapstream(rules_file, output_dir, write_mode,
                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
                                 outcounts[tgtfile] += 1

-
+                                metrics.increment_with_datacol(
+                                    source_path=srcfilename,
+                                    target_file=tgtfile,
+                                    datacol=datacol,
+                                    out_record=outrecord
+                                )

                                 # write the line to the file
                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
                             else:
-
-
+                                metrics.increment_key_count(
+                                    source=srcfilename,
+                                    fieldname="all",
+                                    tablename=tgtfile,
+                                    concept_id="all",
+                                    additional="",
+                                    count_type="invalid_person_ids",
+                                )
                                 rejidcounts[srcfilename] += 1

         fh.close()

-
-        print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+        logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
         for outtablename, count in outcounts.items():
-
+            logger.info(f"TARGET: {outtablename}: output count {str(count)}")
     # END main processing loop

-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+
     data_summary = metrics.get_mapstream_summary()
     try:
-        dsfh =
+        dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
         dsfh.write(data_summary)
         dsfh.close()
     except IOError as e:
-
-
+        logger.exception(f"I/O error({e.errno}): {e.strerror}")
+        logger.exception("Unable to write file")
+        raise e

     # END mapstream
-
-
-
-    def
-
-
-
-
-
-
-
-
-    if tgtfile == "person":
-        key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
-        metrics.increment_key_count(key, "output_count")
-    else:
-        key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = "all~all~all~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-    return
-
-
-def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
-tuple[bool, list[str], tools.metrics.Metrics]:
+    logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
+
+
+def get_target_records(
+    tgtfilename: str,
+    tgtcolmap: dict[str, dict[str, int]],
+    rulesmap: dict[str, list[dict[str, list[str]]]],
+    srcfield: str,
+    srcdata: list[str],
+    srccolmap: dict[str, int],
+    srcfilename: str,
+    omopcdm: OmopCDM,
+    metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
     """
     build all target records for a given input field
     """
@@ -292,8 +380,8 @@ tuple[bool, list[str], tools.metrics.Metrics]:
     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)

-    srckey = srcfilename
-    summarykey =
+    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+    summarykey = srckey + "~all~"
     if valid_value(str(srcdata[srccolmap[srcfield]])):
         ## check if either or both of the srckey and summarykey are in the rules
         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
@@ -304,7 +392,7 @@ tuple[bool, list[str], tools.metrics.Metrics]:
         if srckey in rulesmap:
             build_records = True
             dictkeys.append(srckey)
-    if build_records
+    if build_records:
         for dictkey in dictkeys:
             for out_data_elem in rulesmap[dictkey]:
                 valid_data_elem = True
@@ -333,27 +421,47 @@ tuple[bool, list[str], tools.metrics.Metrics]:
                             fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                             tgtarray[tgtcolmap[output_col_data]] = fulldate
                         else:
-                            metrics.increment_key_count(
+                            metrics.increment_key_count(
+                                source=srcfilename,
+                                fieldname=srcfield,
+                                tablename=tgtfilename,
+                                concept_id="all",
+                                additional="",
+                                count_type="invalid_date_fields"
+                            )
                             valid_data_elem = False
                     elif output_col_data in date_col_data:
                         fulldate = srcdata[srccolmap[infield]]
                         tgtarray[tgtcolmap[output_col_data]] = fulldate
                         tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                if valid_data_elem
+                if valid_data_elem:
                     tgtrecords.append(tgtarray)
                 else:
-                    metrics.increment_key_count(
-
+                    metrics.increment_key_count(
+                        source=srcfilename,
+                        fieldname=srcfield,
+                        tablename=tgtfilename,
+                        concept_id="all",
+                        additional="",
+                        count_type="invalid_source_fields"
+                    )

     return build_records, tgtrecords, metrics

+
 def valid_value(item):
     """
     Check if an item is non blank (null)
     """
     if item.strip() == "":
-        return
-    return
+        return False
+    return True
+
+
+# DATE TESTING
+# ------------
+# I started by changing the get_datetime_value to be neater.
+# I think it should be handled all as one thing, but I've spent too much time doing this already

 def valid_date_value(item):
     """
@@ -363,44 +471,33 @@ def valid_date_value(item):
     if item.strip() == "":
         return(False)
     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-
-        return
-    return
+        logger.warning("Bad date : {0}".format(item))
+        return False
+    return True
+

 def get_datetime_value(item):
     """
-    Check if a date item is non
-    or
+    Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+    or UK format (DD/MM/YYYY).
+    Returns a datetime object if successful, None otherwise.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    pass
-
-    if dt != None:
-        return(dt)
-
-    # Does the date parse as a UK old-style date?
-    try:
-        dt = datetime.datetime.strptime(item, "%d/%m/%Y")
-    except ValueError:
-        pass
-
-    if dt != None:
-        return(dt)
-
+    date_formats = [
+        "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
+        "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
+        "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
+    ]
+
+    for date_format in date_formats:
+        try:
+            return datetime.datetime.strptime(item, date_format)
+        except ValueError:
+            continue
+
+    # If we get here, none of the formats worked
     return None

+
 def parse_date(item):
     """
     Crude hand-coded check on date format
@@ -411,9 +508,8 @@ def parse_date(item):
     if len(datedata) != 3:
         return None
     if len(datedata[2]) == 4:
-        return("{
-    return
-
+        return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
+    return "-".join(datedata[:3])

 def valid_iso_date(item):
     """
@@ -422,9 +518,10 @@ def valid_iso_date(item):
     try:
         datetime.datetime.strptime(item, "%Y-%m-%d")
     except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

 def valid_reverse_iso_date(item):
     """
@@ -433,9 +530,10 @@ def valid_reverse_iso_date(item):
     try:
         datetime.datetime.strptime(item, "%d-%m-%Y")
     except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

 def valid_uk_date(item):
     """
@@ -444,12 +542,15 @@ def valid_uk_date(item):
     try:
         datetime.datetime.strptime(item, "%d/%m/%Y")
     except ValueError:
-        return
+        return False

-    return
+    return True

-
-
+
+# End of date code
+
+def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter="\t")

     for last_ids_data in csvr:
@@ -458,8 +559,9 @@ def load_last_used_ids(last_used_ids_file, last_used_ids):
     fh.close()
     return last_used_ids

-
-
+
+def load_saved_person_ids(person_file: Path):
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter="\t")
     last_int = 1
     person_ids = {}
@@ -475,23 +577,28 @@ def load_saved_person_ids(person_file):
 def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
     person_ids, person_number = get_person_lookup(saved_person_id_file)

-    fh = open(
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter=delim)
     person_columns = {}
     person_col_in_hdr_number = 0
     reject_count = 0

     personhdr = next(csvr)
-
+    logger.info(personhdr)

     # Make a dictionary of column names vs their positions
     for col in personhdr:
         person_columns[col] = person_col_in_hdr_number
         person_col_in_hdr_number += 1

-    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
-    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
-
+    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+        "person"
+    )
+    logger.info(
+        "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
+    )
+
     ## get the column index of the PersonID from the input file
     person_col = person_columns[person_id_source]

@@ -516,55 +623,122 @@ def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_p
 def py():
     pass

-
+
+def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
+    """Check if directory is valid, optionally create it if missing.
+
+    Args:
+        directory: Directory path as string or tuple
+        create_if_missing: If True, create directory if it doesn't exist
+    """
+
+    ## check directory has been set
+    if directory is None:
+        logger.warning("Directory not provided.")
+        sys.exit(1)
+
     ## check output dir is valid
-
+    elif type(directory) is tuple:
         directory = directory[0]

-    if not os.path.isdir(directory):
-        print("Not a directory, dir {0}".format(directory))
-        sys.exit(1)

-
-
+    ## if not a directory, create it if requested (including parents. This option is for the output directory only).
+    if not directory.is_dir():
+        if create_if_missing:
+            try:
+                ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+                directory.mkdir(parents = True)
+                logger.info(f"Created directory: {directory}")
+            except OSError as e:
+                logger.warning(f"Failed to create directory {directory}: {e}")
+                sys.exit(1)
+        else:
+            logger.warning(f"Not a directory, dir {directory}")
+            sys.exit(1)
+
+    # Handle tuple input (like input_dir)
+    if isinstance(directory, tuple):
+        if not directory:  # Empty tuple
+            print("No directory provided")
+            sys.exit(1)
+        directory = directory[0]
+
+    # Handle string input
+    dir_path = str(directory)
+    if not os.path.isdir(dir_path):
+        if create_if_missing:
+            try:
+                os.makedirs(dir_path)
+                print(f"Created directory: {dir_path}")
+            except OSError as e:
+                print(f"Failed to create directory {dir_path}: {e}")
+                sys.exit(1)
+        else:
+            print(f"Not a directory, dir {dir_path}")
+            sys.exit(1)
+
+
+def set_saved_person_id_file(
+    saved_person_id_file: Path | None, output_dir: Path
+) -> Path:
+    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
     if saved_person_id_file is None:
-        saved_person_id_file = output_dir
-        if
-
+        saved_person_id_file = output_dir / "person_ids.tsv"
+        if saved_person_id_file.exists():
+            assert not saved_person_id_file.is_dir()
+            saved_person_id_file.unlink()
+    else:
+        assert not saved_person_id_file.is_dir()
     return saved_person_id_file

-
 def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
     for infile in existing_input_files:
         if infile not in rules_input_files:
-            msg =
-
+            msg = (
+                "WARNING: no mapping rules found for existing input file - {0}".format(
+                    infile
+                )
+            )
+            logger.warning(msg)
     for infile in rules_input_files:
         if infile not in existing_input_files:
             msg = "WARNING: no data for mapped input file - {0}".format(infile)
-
+            logger.warning(msg)

-def open_file(
-
+def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+    """opens a file and does something related to CSVs"""
     try:
-        fh = open(
+        fh = file_path.open(mode="r", encoding="utf-8-sig")
         csvr = csv.reader(fh)
         return fh, csvr
     except IOError as e:
-
-
+        logger.exception("Unable to open: {0}".format(file_path))
+        logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
        return None

-
-
-
+
+def set_omop_filenames(
+    omop_ddl_file: Path, omop_config_file: Path, omop_version: str
+) -> tuple[Path, Path]:
+    if (
+        (omop_ddl_file is None)
+        and (omop_config_file is None)
+        and (omop_version is not None)
+    ):
+        omop_config_file = (
+            importlib.resources.files("carrottransform") / "config/omop.json"
+        )
         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-        omop_ddl_file =
+        omop_ddl_file = (
+            importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
+        )
         return omop_config_file, omop_ddl_file

-
+
+def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
     # Saved-person-file existence test, reload if found, return last used integer
-    if
+    if saved_person_id_file.is_file():
         person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
     else:
         person_lookup = {}
@@ -572,6 +746,3 @@ def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
     return person_lookup, last_used_integer

 run.add_command(mapstream,"mapstream")
-
-if __name__== '__main__':
-    mapstream()
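For orientation, here is a sketch of how the reworked mapstream command could be driven after this change, using click's test runner from Python. The file and directory names below are assumptions, not taken from this diff; the option names, the repeatable --input-dir, and the @carrot prefix (resolved against the installed package by resolve_paths) come from the code above.

# Illustrative only: the paths are made up; option names match the decorators in the diff above.
from click.testing import CliRunner
from carrottransform.cli.subcommands.run import mapstream

runner = CliRunner()
result = runner.invoke(mapstream, [
    "--rules-file", "rules.json",                                        # assumed local file
    "--person-file", "demo/person.csv",                                  # assumed local file
    "--output-dir", "out",                                               # created if missing (check_dir_isvalid)
    "--omop-ddl-file", "@carrot/config/OMOPCDM_postgresql_5.3_ddl.sql",  # @carrot -> installed package directory
    "--omop-config-file", "@carrot/config/omop.json",
    "--input-dir", "demo/inputs",                                        # multiple=True, may be repeated
])
print(result.exit_code)
print(result.output)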
carrottransform/tools/click.py

@@ -0,0 +1,21 @@
+import click
+from pathlib import Path
+
+
+def PathArgs():
+    """used by the click library for CLI args that are files"""
+
+    class PathArgs(click.ParamType):
+        name = "pathlib.Path"
+
+        def convert(self, value, param, ctx):
+            try:
+                return Path(value)
+            except Exception as e:
+                self.fail(f"Invalid path: {value} ({e})", param, ctx)
+
+    return PathArgs()
+
+
+# use this
+PathArgs = PathArgs()
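A small usage sketch of the new PathArgs parameter type; the command and option below are hypothetical, only PathArgs itself comes from the package. click calls PathArgs.convert() on the raw string, so the handler receives a pathlib.Path.

# Hypothetical command for illustration.
import click
from pathlib import Path
from carrottransform.tools.click import PathArgs

@click.command()
@click.option("--config", type=PathArgs, required=True)
def show(config: Path):
    # the option arrives already converted to a pathlib.Path by PathArgs.convert()
    click.echo(f"{type(config).__name__}: {config}")

if __name__ == "__main__":
    show()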
carrottransform/tools/file_helpers.py

@@ -1,15 +1,41 @@
+import json
+import logging
 import os
 import sys
 import json
+import importlib.resources as resources
+from typing import List, Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+

 # Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
-
-
+
+
+def load_json(f_in: Path):
     try:
-        data = json.load(open(
+        data = json.load(f_in.open())
     except Exception as err:
-
+        logger.exception("{0} not found. Or cannot parse as json".format(f_in))
         sys.exit()

     return data

+
+def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
+    """Resolve special path syntaxes in command line arguments."""
+    try:
+        with resources.files('carrottransform').joinpath('__init__.py') as f:
+            package_path = f.parent
+    except Exception:
+        # Fallback for development environment
+        import carrottransform
+        package_path = Path(carrottransform.__file__).resolve().parent
+
+    # Handle None values and replace @carrot with the actual package path
+    prefix = '@carrot'
+    return [
+        package_path / Path(str(arg).replace(prefix, '').lstrip('/')) if arg is not None and str(arg).startswith(prefix) else arg
+        for arg in args
+    ]
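A minimal sketch of what the new resolve_paths() helper does with the @carrot prefix; the exact resolved location depends on where carrottransform is installed, and the other paths here are made-up examples.

from pathlib import Path
from carrottransform.tools.file_helpers import resolve_paths

args = [Path("@carrot/config/omop.json"), Path("local/rules.json"), None]
resolved = resolve_paths(args)
# resolved[0] -> <carrottransform package dir>/config/omop.json
# resolved[1] -> Path("local/rules.json") (unchanged); resolved[2] -> None (preserved)
print(resolved)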
carrottransform/tools/mappingrules.py

@@ -3,13 +3,16 @@ import json
 import carrottransform.tools as tools
 from .omopcdm import OmopCDM

+import logging
+logger = logging.getLogger(__name__)
+
 class MappingRules:
     """
     self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
     as a file-specific dictionary allowing rules to be "looked-up" depending on data content
     """

-    def __init__(self, rulesfilepath, omopcdm):
+    def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
         ## just loads the json directly
         self.rules_data = tools.load_json(rulesfilepath)
         self.omopcdm = omopcdm
@@ -80,7 +83,7 @@ class MappingRules:
             outfile = keydata[-1]
             for outfield_elem in outfield_data:
                 for infield, outfield_list in outfield_elem.items():
-
+                    logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
                     for outfield in outfield_list:
                         if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
                             datetime_source = infield
carrottransform/tools/metrics.py
CHANGED

@@ -1,3 +1,95 @@
+
+import logging
+logger = logging.getLogger(__name__)
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname:str
+    tablename:str
+    concept_id:str
+    additional:str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is here in case that representation is needed somewhere
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts
+        """
+        return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int=0):
+        return self.counts.get(count_type, default)
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [str(col) for col in [
+            self.dataset_name,
+            self.source,
+            self.fieldname,
+            self.tablename,
+            self.concept_id,
+            self.additional,
+            self.input_count,
+            self.invalid_person_ids,
+            self.invalid_date_fields,
+            self.invalid_source_fields,
+            self.output_count
+        ]]
+        # If python gets updated, you can move the row_str expression into the f-string
+        row_str = '\t'.join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount"
+        ]
+        header_str = '\t'.join(header)
+        return f"{header_str}\n"
+
 class Metrics():
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
@@ -58,21 +150,87 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])

-    def increment_key_count(self,
-
-
-        """
+    def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] =
-
-
-
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        #Are the parameters for DataKeys hierarchical?
+        #If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        #But I don't know if that's the desired behaviour
+
+        #A lot of these increment the same thing, so I have defined `increment_this`
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional = "",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count"
+            )
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+

     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"

         for dkey in self.datasummary:
-
+            logger.debug(dkey)
             source, tablename, name, colname = dkey.split('.')
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
@@ -90,40 +248,54 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary

-    def
-
-
-
-
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
-
-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []

-
-
-
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]

-
-
-
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count")
+            )

-
-
-
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows

-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
+
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()
+
+        for row in summary_rows:
+            result += row.to_tsv_row()
+
+        return result

-
-
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows]
+        }
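A short sketch of the structured key/count types added above; the values are made up, but the behaviour shown (the legacy '~'-joined string form, per-count-type tallies, and the TSV header) is taken from the code in this diff.

from carrottransform.tools.metrics import DataKey, CountData, MapstreamSummaryRow

key = DataKey(source="demo.csv", fieldname="all", tablename="person", concept_id="all", additional="")
counts = CountData()
counts.increment("input_count")
counts.increment("input_count")
counts.increment("output_count")

print(str(key))                          # demo.csv~all~person~all~
print(counts.get_count("input_count"))   # 2
print(MapstreamSummaryRow.get_header(), end="")  # dsname, source, ..., outcount (tab-separated)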
carrottransform/tools/omopcdm.py
CHANGED

@@ -1,8 +1,13 @@
 import carrottransform.tools as tools
 import json
+import logging
 import re
 import sys

+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
@@ -29,11 +34,11 @@ class OmopCDM:
         self.auto_number_field = self.get_columns("auto_number_field")


-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(
+            fp = omopddl.open("r")
         except Exception as err:
-
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()

         return(self.process_ddl(fp))
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE — file without changes
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/entry_points.txt — file without changes