carrot-transform 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of carrot-transform might be problematic.
- carrot_transform-0.1.0.dist-info/LICENSE +21 -0
- carrot_transform-0.1.0.dist-info/METADATA +48 -0
- carrot_transform-0.1.0.dist-info/RECORD +17 -0
- carrot_transform-0.1.0.dist-info/WHEEL +4 -0
- carrottransform/__init__.py +5 -0
- carrottransform/_version.py +2 -0
- carrottransform/cli/__init__.py +0 -0
- carrottransform/cli/command.py +21 -0
- carrottransform/cli/subcommands/__init__.py +0 -0
- carrottransform/cli/subcommands/run.py +496 -0
- carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +508 -0
- carrottransform/config/omop.json +67 -0
- carrottransform/tools/__init__.py +17 -0
- carrottransform/tools/file_helpers.py +15 -0
- carrottransform/tools/mappingrules.py +161 -0
- carrottransform/tools/metrics.py +129 -0
- carrottransform/tools/omopcdm.py +187 -0
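
The layout above splits the package into a click-based CLI (carrottransform/cli), transformation helpers (carrottransform/tools), and bundled OMOP 5.3 configuration (carrottransform/config). As a minimal smoke test, assuming the wheel has been installed (for example with `pip install carrot-transform==0.1.0`, a hypothetical install line not shown in this diff), the version string can be checked directly; command.py below relies on the same attribute:

    # Minimal sketch, assuming the wheel is installed in the current environment.
    import carrottransform

    # __version__ is defined in carrottransform/_version.py and exposed on the
    # top-level package (command.py reads it as c.__version__).
    print(carrottransform.__version__)  # expected: 0.1.0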
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [2024] [Philip Duncan Appleby]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,48 @@
+Metadata-Version: 2.3
+Name: carrot_transform
+Version: 0.1.0
+Summary:
+Author: anwarfg
+Author-email: 913028+anwarfg@users.noreply.github.com
+Requires-Python: >=3.10,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: click (>=8.1.7,<9.0.0)
+Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+Requires-Dist: pandas (>=2.2.3,<3.0.0)
+Description-Content-Type: text/markdown
+
+<p align="center">
+  <a href="https://carrot.ac.uk/" target="_blank">
+    <picture>
+      <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
+      <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
+    </picture>
+  </a>
+</p>
+<div align="center">
+  <strong>
+    <h2>Streamlined Data Mapping to OMOP</h2>
+    <a href="https://carrot.ac.uk/">Carrot Transform</a> executes the conversion of the data to the OMOP CDM.<br />
+  </strong>
+</div>
+
+TODO:
+
+- Document carrot-transform
+- Add more comments in-code
+- Handle capture of ddl and json config via the command-line as optional args
+
+Carrot Transform reduces complexity relative to the original CaRROT-CDM version by covering only the Transform part of
+_ETL_: in practice, _Extract_ is performed by Data Partners and _Load_ by database bulk-load software.
+
+Statistics
+
+External libraries imported (approximate):
+
+- carrot-cdm: 61
+- carrot-transform: 12
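
The TODO item about capturing the ddl and json config via the command line is partly implemented in run.py below: when neither --omop-ddl-file nor --omop-config-file is given but --omop-version is, the bundled files are resolved with importlib.resources. A minimal sketch of that lookup (pkg_root is a name introduced here for illustration; the path construction mirrors cli/subcommands/run.py):

    # Sketch of the bundled-config lookup used in carrottransform/cli/subcommands/run.py
    import importlib.resources

    pkg_root = str(importlib.resources.files('carrottransform'))
    omop_config_file = pkg_root + '/config/omop.json'
    omop_ddl_file = pkg_root + '/config/OMOPCDM_postgresql_5.3_ddl.sql'
    print(omop_config_file)
    print(omop_ddl_file)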
@@ -0,0 +1,17 @@
+carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
+carrottransform/_version.py,sha256=NfGqG2TgfjxxrlCHaOtwl3BcE0f6UH0VPrQgoDPjV7Y,72
+carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
+carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+carrottransform/cli/subcommands/run.py,sha256=3z5cRG4ekyPOP5tvjZOyHUxbclKfBr_Z0tQRRoKj73E,20651
+carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
+carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
+carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
+carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
+carrottransform/tools/mappingrules.py,sha256=bV6tXHBwVeKAUgCwFTZE2-qTcxKtbs3zbJWedBSviVI,6567
+carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
+carrottransform/tools/omopcdm.py,sha256=ycyPGgUTUwui7MLxH8JXd-MyCRkG0xOfEoDhCXeogmQ,7623
+carrot_transform-0.1.0.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+carrot_transform-0.1.0.dist-info/METADATA,sha256=d-7gk1VxSdU4MzrMLQi5lnIVdQ4tIBlJA3MMmY_5Nsc,1474
+carrot_transform-0.1.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+carrot_transform-0.1.0.dist-info/RECORD,,

File without changes
@@ -0,0 +1,21 @@
+# Package entry point - sets up the "run" subcommand
+from .subcommands.run import run
+
+import carrottransform as c
+import click
+
+@click.group(invoke_without_command=True)
+@click.option("--version", "-v", is_flag=True)
+@click.pass_context
+def transform(ctx, version):
+    # With no subcommand, echo the version if requested, otherwise the help text
+    if ctx.invoked_subcommand is None:
+        if version:
+            click.echo(c.__version__)
+        else:
+            click.echo(ctx.get_help())
+    return
+
+transform.add_command(run, "run")
+
+if __name__ == "__main__":
+    transform()
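
A quick way to exercise this entry point without installing a console script is click's test runner; this is a sketch for illustration, not part of the package:

    # Sketch: drive the click group through click's testing utilities
    from click.testing import CliRunner
    from carrottransform.cli.command import transform

    runner = CliRunner()
    print(runner.invoke(transform, ["--version"]).output)     # echoes the package version
    print(runner.invoke(transform, ["run", "--help"]).output) # lists the "mapstream" subcommand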
File without changes
@@ -0,0 +1,496 @@
+import csv
+import os, time
+import datetime
+import fnmatch
+import sys
+import click
+import json
+import importlib.resources
+import carrottransform
+import carrottransform.tools as tools
+
+@click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+def run():
+    pass
+
+@click.command()
+@click.option("--rules-file",
+              required=True,
+              help="json file containing mapping rules")
+@click.option("--output-dir",
+              default=None,
+              help="define the output directory for OMOP-format tsv files")
+@click.option("--write-mode",
+              default='w',
+              type=click.Choice(['w', 'a']),
+              help="force write-mode on output files")
+@click.option("--person-file",
+              required=True,
+              help="File containing person_ids in the first column")
+@click.option("--omop-ddl-file",
+              required=False,
+              help="File containing OHDSI ddl statements for OMOP tables")
+@click.option("--omop-config-file",
+              required=False,
+              help="File containing additional / override json config for omop outputs")
+@click.option("--omop-version",
+              required=False,
+              help="Quoted string containing omop version - eg '5.3'")
+@click.option("--saved-person-id-file",
+              default=None,
+              required=False,
+              help="Full path to person id file used to save person_id state and share person_ids between data sets")
+@click.option("--use-input-person-ids",
+              required=False,
+              default='N',
+              help="Use person ids as input without generating new integers")
+@click.option("--last-used-ids-file",
+              default=None,
+              required=False,
+              help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
+@click.option("--log-file-threshold",
+              required=False,
+              default=0,
+              help="Lower outcount limit for logfile output")
+@click.argument("input-dir",
+                required=False,
+                nargs=-1)
+def mapstream(rules_file, output_dir, write_mode,
+              person_file, omop_ddl_file, omop_config_file,
+              omop_version, saved_person_id_file, use_input_person_ids,
+              last_used_ids_file, log_file_threshold, input_dir):
+    """
+    Map to output using input streams
+    """
+    # Initialisation
+    # - check for values in optional arguments
+    # - read in configuration files
+    # - check main directories for existence
+    # - handle saved person ids
+    # - initialise metrics
+    if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
+        omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+        omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+
+    if not os.path.isdir(input_dir[0]):
+        print("Not a directory, input dir {0}".format(input_dir[0]))
+        sys.exit(1)
+
+    if not os.path.isdir(output_dir):
+        print("Not a directory, output dir {0}".format(output_dir))
+        sys.exit(1)
+
+    if saved_person_id_file is None:
+        saved_person_id_file = output_dir + "/" + "person_ids.tsv"
+        if os.path.exists(saved_person_id_file):
+            os.remove(saved_person_id_file)
+
+    starttime = time.time()
+    omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+    mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
+    metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
+    nowtime = time.time()
+
+    print("--------------------------------------------------------------------------------")
+    print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+    output_files = mappingrules.get_all_outfile_names()
+    record_numbers = {}
+    for output_file in output_files:
+        record_numbers[output_file] = 1
+
+    fhd = {}
+    tgtcolmaps = {}
+
+    try:
+        # Saved-person-file existence test, reload if found, return last used integer
+        if os.path.isfile(saved_person_id_file):
+            person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+        else:
+            person_lookup = {}
+            last_used_integer = 1
+        if last_used_ids_file is not None:
+            if os.path.isfile(last_used_ids_file):
+                record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+
+        person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
+        fhpout = open(saved_person_id_file, mode="w")
+        fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+        for person_id, person_assigned_id in person_lookup.items():
+            fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+        fhpout.close()
+        # Initialise output files, output a header for each
+        for tgtfile in output_files:
+            fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
+            if write_mode == 'w':
+                outhdr = omopcdm.get_omop_column_list(tgtfile)
+                fhd[tgtfile].write("\t".join(outhdr) + "\n")
+            tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
+
+    except IOError as e:
+        print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+        sys.exit()
+
+    print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+
+    # Compare files found in the input_dir with those expected based on mapping rules
+    existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+    rules_input_files = mappingrules.get_all_infile_names()
+    # Log mismatches but continue
+    for infile in existing_input_files:
+        if infile not in rules_input_files:
+            msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
+            print(msg)
+    for infile in rules_input_files:
+        if infile not in existing_input_files:
+            msg = "ERROR: no data for mapped input file - {0}".format(infile)
+            print(msg)
+
+    # set up overall counts
+    rejidcounts = {}
+    rejdatecounts = {}
+    print(rules_input_files)
+
+    # set up per-input counts
+    for srcfilename in rules_input_files:
+        rejidcounts[srcfilename] = 0
+        rejdatecounts[srcfilename] = 0
+
+    # main processing loop, for each input file
+    for srcfilename in rules_input_files:
+        outcounts = {}
+        rejcounts = {}
+        rcount = 0
+
+        try:
+            fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
+            csvr = csv.reader(fh)
+        except IOError as e:
+            print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
+            print("I/O error({0}): {1}".format(e.errno, e.strerror))
+            continue
+
+        tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
+        infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+        for tgtfile in tgtfiles:
+            outcounts[tgtfile] = 0
+            rejcounts[tgtfile] = 0
+        datacolsall = []
+        hdrdata = next(csvr)
+        dflist = mappingrules.get_infile_data_fields(srcfilename)
+        for colname in hdrdata:
+            datacolsall.append(colname)
+        inputcolmap = omopcdm.get_column_map(hdrdata)
+        pers_id_col = inputcolmap[infile_person_id_source]
+        datetime_col = inputcolmap[infile_datetime_source]
+        print("--------------------------------------------------------------------------------")
+        print("Processing input: {0}".format(srcfilename))
+
+        # for each input record
+        for indata in csvr:
+            key = srcfilename + "~all~all~all~"
+            metrics.increment_key_count(key, "input_count")
+            rcount += 1
+            strdate = indata[datetime_col].split(" ")[0]
+            fulldate = parse_date(strdate)
+            if fulldate is not None:
+                indata[datetime_col] = fulldate
+            else:
+                metrics.increment_key_count(key, "invalid_date_fields")
+                continue
+
+            for tgtfile in tgtfiles:
+                tgtcolmap = tgtcolmaps[tgtfile]
+                auto_num_col = omopcdm.get_omop_auto_number_field(tgtfile)
+                pers_id_col = omopcdm.get_omop_person_id_field(tgtfile)
+
+                datacols = datacolsall
+                if tgtfile in dflist:
+                    datacols = dflist[tgtfile]
+
+                for datacol in datacols:
+                    built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
+                    if built_records:
+                        for outrecord in outrecords:
+                            if auto_num_col is not None:
+                                outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                                record_numbers[tgtfile] += 1
+                            if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
+                                outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                outcounts[tgtfile] += 1
+                                key = srcfilename + "~all~all~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                key = "all~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                key = srcfilename + "~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                if tgtfile == "person":
+                                    key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
+                                    metrics.increment_key_count(key, "output_count")
+                                else:
+                                    key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = "all~all~all~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                fhd[tgtfile].write("\t".join(outrecord) + "\n")
+                            else:
+                                key = srcfilename + "~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "invalid_person_ids")
+                                rejidcounts[srcfilename] += 1
+
+        fh.close()
+
+        nowtime = time.time()
+        print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+        for outtablename, count in outcounts.items():
+            print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+    # END main processing loop
+
+    print("--------------------------------------------------------------------------------")
+    data_summary = metrics.get_mapstream_summary()
+    try:
+        dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+        dsfh.write(data_summary)
+        dsfh.close()
+    except IOError as e:
+        print("I/O error({0}): {1}".format(e.errno, e.strerror))
+        print("Unable to write file")
+
+    # END mapstream
+    nowtime = time.time()
+    print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
+
+def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    """
+    build all target records for a given input field
+    """
+    build_records = False
+    tgtrecords = []
+    date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+    date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+    notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+    srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
+    summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
+    if valid_value(str(srcdata[srccolmap[srcfield]])):
+        srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
+        dictkeys = []
+        if srcfullkey in rulesmap:
+            build_records = True
+            dictkeys.append(srcfullkey)
+        if srckey in rulesmap:
+            build_records = True
+            dictkeys.append(srckey)
+        if build_records:
+            for dictkey in dictkeys:
+                for out_data_elem in rulesmap[dictkey]:
+                    valid_data_elem = True
+                    tgtarray = [''] * len(tgtcolmap)
+                    for req_integer in notnull_numeric_fields:
+                        tgtarray[tgtcolmap[req_integer]] = "0"
+                    for infield, outfield_list in out_data_elem.items():
+                        for output_col_data in outfield_list:
+                            if "~" in output_col_data:
+                                outcol, term = output_col_data.split("~")
+                                tgtarray[tgtcolmap[outcol]] = term
+                            else:
+                                tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
+                                if output_col_data in date_component_data:
+                                    strdate = srcdata[srccolmap[infield]].split(" ")[0]
+                                    dt = get_datetime_value(strdate)
+                                    if dt is not None:
+                                        year_field = date_component_data[output_col_data]["year"]
+                                        month_field = date_component_data[output_col_data]["month"]
+                                        day_field = date_component_data[output_col_data]["day"]
+                                        tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                        tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                        tgtarray[tgtcolmap[day_field]] = str(dt.day)
+                                        fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                                        tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                    else:
+                                        metrics.increment_key_count(summarykey, "invalid_date_fields")
+                                        valid_data_elem = False
+                                elif output_col_data in date_col_data:
+                                    fulldate = srcdata[srccolmap[infield]]
+                                    tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                    tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
+                    if valid_data_elem:
+                        tgtrecords.append(tgtarray)
+    else:
+        metrics.increment_key_count(summarykey, "invalid_source_fields")
+
+    return build_records, tgtrecords, metrics
+
+def valid_value(item):
+    """
+    Check if an item is non blank (null)
+    """
+    if item.strip() == "":
+        return False
+    return True
+
+def valid_date_value(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
+    or dd/mm/yyyy or mm/dd/yyyy
+    """
+    if item.strip() == "":
+        return False
+    if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
+        #print("Bad date : {0}".format(item))
+        return False
+    return True
+
+def get_datetime_value(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
+    or dd/mm/yyyy or mm/dd/yyyy
+    """
+    dt = None
+    # Does the date parse as an ISO date?
+    try:
+        dt = datetime.datetime.strptime(item, "%Y-%m-%d")
+    except ValueError:
+        pass
+    if dt is not None:
+        return dt
+
+    # Does the date parse as a reverse ISO date?
+    try:
+        dt = datetime.datetime.strptime(item, "%d-%m-%Y")
+    except ValueError:
+        pass
+
+    if dt is not None:
+        return dt
+
+    # Does the date parse as a UK old-style date?
+    try:
+        dt = datetime.datetime.strptime(item, "%d/%m/%Y")
+    except ValueError:
+        pass
+
+    if dt is not None:
+        return dt
+
+    return None
+
+def parse_date(item):
+    """
+    Crude hand-coded check on date format
+    """
+    datedata = item.split("-")
+    if len(datedata) != 3:
+        datedata = item.split("/")
+        if len(datedata) != 3:
+            return None
+        if len(datedata[2]) == 4:
+            # dd/mm/yyyy - reverse into ISO field order
+            return "{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0])
+    return "{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2])
+
+def valid_iso_date(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD)
+    """
+    try:
+        datetime.datetime.strptime(item, "%Y-%m-%d")
+    except ValueError:
+        return False
+
+    return True
+
+def valid_reverse_iso_date(item):
+    """
+    Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
+    """
+    try:
+        datetime.datetime.strptime(item, "%d-%m-%Y")
+    except ValueError:
+        return False
+
+    return True
+
+def valid_uk_date(item):
+    """
+    Check if a date item is non null and parses as UK format (DD/MM/YYYY)
+    """
+    try:
+        datetime.datetime.strptime(item, "%d/%m/%Y")
+    except ValueError:
+        return False
+
+    return True
+
+def load_last_used_ids(last_used_ids_file, last_used_ids):
+    fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+
+    for last_ids_data in csvr:
+        last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+    fh.close()
+    return last_used_ids
+
+def load_saved_person_ids(person_file):
+    fh = open(person_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+    last_int = 1
+    person_ids = {}
+
+    next(csvr)
+    for persondata in csvr:
+        person_ids[persondata[0]] = persondata[1]
+        last_int += 1
+
+    fh.close()
+    return person_ids, last_int
+
+def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
+    fh = open(person_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter=delim)
+    person_columns = {}
+    person_col_in_hdr_number = 0
+    reject_count = 0
+
+    personhdr = next(csvr)
+    print(personhdr)
+
+    # Make a dictionary of column names vs their positions
+    for col in personhdr:
+        person_columns[col] = person_col_in_hdr_number
+        person_col_in_hdr_number += 1
+
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
+    print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+    person_col = person_columns[person_id_source]
+
+    for persondata in csvr:
+        if not valid_value(persondata[person_columns[person_id_source]]):
+            reject_count += 1
+            continue
+        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+            reject_count += 1
+            continue
+        if persondata[person_col] not in person_ids:
+            if use_input_person_ids == "N":
+                person_ids[persondata[person_col]] = str(person_number)
+                person_number += 1
+            else:
+                person_ids[persondata[person_col]] = str(persondata[person_col])
+    fh.close()
+
+    return person_ids, reject_count
+
+@click.group(help="Commands for using python configurations to run the ETL transformation.")
+def py():
+    pass
+
+run.add_command(mapstream, "mapstream")
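
Putting the pieces together, an end-to-end invocation might look like the sketch below. The subcommand path (transform -> run -> mapstream) and the option names come from the code above; the file and directory names (rules.json, persons.csv, data/, output/) are placeholders:

    python -m carrottransform.cli.command run mapstream \
        --rules-file rules.json \
        --person-file persons.csv \
        --output-dir output \
        --omop-version 5.3 \
        data

With only --omop-version supplied, mapstream falls back to the bundled config/omop.json and config/OMOPCDM_postgresql_5.3_ddl.sql. It writes one <table>.tsv per mapped OMOP table into the output directory, plus person_ids.tsv (the source-to-target person id map) and summary_mapstream.tsv (the metrics summary).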