carrot_transform-0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic.
- carrot_transform-0.3.dist-info/LICENSE +21 -0
- carrot_transform-0.3.dist-info/METADATA +28 -0
- carrot_transform-0.3.dist-info/RECORD +19 -0
- carrot_transform-0.3.dist-info/WHEEL +5 -0
- carrot_transform-0.3.dist-info/entry_points.txt +2 -0
- carrot_transform-0.3.dist-info/top_level.txt +1 -0
- carrottransform/__init__.py +5 -0
- carrottransform/_version.py +2 -0
- carrottransform/cli/__init__.py +0 -0
- carrottransform/cli/command.py +21 -0
- carrottransform/cli/subcommands/__init__.py +0 -0
- carrottransform/cli/subcommands/run.py +484 -0
- carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +508 -0
- carrottransform/config/omop.json +61 -0
- carrottransform/tools/__init__.py +17 -0
- carrottransform/tools/file_helpers.py +14 -0
- carrottransform/tools/mappingrules.py +157 -0
- carrottransform/tools/metrics.py +127 -0
- carrottransform/tools/omopcdm.py +182 -0
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [2024] [Philip Duncan Appleby]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,28 @@
+Metadata-Version: 2.1
+Name: carrot-transform
+Version: 0.3
+Summary: Carrot simple transformer, input rules and data csvs, output OMOP
+Author-email: PD Appleby <pdappleby@gmail.com>
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# carrot-transform
+
+TODO:
+* Document carrot-transform
+* Add more comments in-code
+* Handle capture of ddl and json config via the command-line as optional args
+
+Reduction in complexity over the original CaRROT-CDM version for the Transform part of *ETL* - in practice *Extract* is always
+performed by Data Partners, *Load* by database bulk-load software.
+
+Statistics
+
+External libraries imported (approximate)
+
+carrot-cdm 61
+carrot-transform 12
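The third TODO item is visible later in this diff: run.py resolves the OMOP json config and the DDL from the installed package data rather than from command-line arguments. A minimal sketch of that resolution, using the paths listed in the RECORD section below:

    import importlib.resources

    # Resolve the packaged config files the same way run.py does below; both
    # files ship inside the wheel (see RECORD for their hashes and sizes).
    base = str(importlib.resources.files('carrottransform'))
    omop_config_file = base + '/config/omop.json'
    omop_ddl_file = base + '/config/OMOPCDM_postgresql_5.3_ddl.sql'
    print(omop_config_file)
    print(omop_ddl_file)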
@@ -0,0 +1,19 @@
+carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
+carrottransform/_version.py,sha256=GIcaSIQ2wetvh_X8XcZC4nmbIniXNzgn9zFpgXoMWW8,70
+carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
+carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+carrottransform/cli/subcommands/run.py,sha256=J081wG4C6gQYNB_ahejyxtoNA_ZI6Aq5YOopWEtAWLw,20384
+carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
+carrottransform/config/omop.json,sha256=WiA1XeEd9K3dH3DRN1uJAzjzQpslGlmL-AxJ9z1PDQI,1687
+carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
+carrottransform/tools/file_helpers.py,sha256=15iNY7qDMXc1p_KHb77ZnV4Tx7wi-vkiufZE4tz6DiM,276
+carrottransform/tools/mappingrules.py,sha256=ru7sExFHEQA0eVbY68P-HQyGtZLUM1NxC_AWKIzgQzQ,6335
+carrottransform/tools/metrics.py,sha256=r6Q2-rt9C13D5fTiwEdxuHx_NjyHpy1zMhcHxCZpZfc,5505
+carrottransform/tools/omopcdm.py,sha256=BdftE6-E0oJwGIrOfrjP8gpEAIR5JQhR1DVzWDrzNO8,7365
+carrot_transform-0.3.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+carrot_transform-0.3.dist-info/METADATA,sha256=7u8s94DZd8CfODLRBI_HygqzGNadhlilT5iUa2QOdV8,865
+carrot_transform-0.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+carrot_transform-0.3.dist-info/entry_points.txt,sha256=WSJqmgB8PEK8iMl3IFEMBYuyXtzHX5PaKbG13R54AH4,75
+carrot_transform-0.3.dist-info/top_level.txt,sha256=UXPSohnlYfzndis3fEcl6f-dg80qwrKdPjnnSsggEUs,16
+carrot_transform-0.3.dist-info/RECORD,,
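Each RECORD row is path,hash,size, where the hash is the urlsafe-base64, unpadded SHA-256 digest defined by the wheel format. A row can be re-checked against an installed file like so (the path is hypothetical and depends on where the wheel was unpacked):

    import base64
    import hashlib

    # Recompute the RECORD-style digest for one packaged file; it should
    # match the omop.json row above if the file is unmodified.
    with open("carrottransform/config/omop.json", "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    print("sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode())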
@@ -0,0 +1 @@
+carrottransform

File without changes
@@ -0,0 +1,21 @@
+# Package entry point - sets up the "run" subcommand
+from .subcommands.run import run
+
+import carrottransform as c
+import click
+
+@click.group(invoke_without_command=True)
+@click.option("--version","-v",is_flag=True)
+@click.pass_context
+def transform(ctx,version):
+    if ctx.invoked_subcommand == None:
+        if version:
+            click.echo(c.__version__)
+        else:
+            click.echo(ctx.get_help())
+    return
+
+transform.add_command(run, "run")
+
+if __name__ == "__main__":
+    transform()
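For orientation (not part of the wheel), the group above can be exercised in-process with click's test runner: with no subcommand, -v/--version echoes carrottransform.__version__, and a bare call prints the group's help text:

    from click.testing import CliRunner

    from carrottransform.cli.command import transform

    runner = CliRunner()

    # No subcommand and no flag: the group prints its own help.
    print(runner.invoke(transform, []).output)

    # The -v/--version flag echoes carrottransform.__version__.
    print(runner.invoke(transform, ["--version"]).output)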
File without changes
@@ -0,0 +1,484 @@
+import csv
+import os, time
+import datetime
+import fnmatch
+import sys
+import click
+import json
+import importlib.resources
+import carrottransform
+import carrottransform.tools as tools
+
+@click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+def run():
+    pass
+
+@click.command()
+@click.option("--rules-file",
+              required=True,
+              help="json file containing mapping rules")
+@click.option("--output-dir",
+              default=None,
+              help="define the output directory for OMOP-format tsv files")
+@click.option("--write-mode",
+              default='w',
+              type=click.Choice(['w','a']),
+              help="force write-mode on output files")
+@click.option("--person-file",
+              required=True,
+              help="File containing person_ids in the first column")
+@click.option("--omop-version",
+              required=True,
+              help="Quoted string containing omop version - eg '5.3'")
+@click.option("--saved-person-id-file",
+              default=None,
+              required=False,
+              help="Full path to person id file used to save person_id state and share person_ids between data sets")
+@click.option("--use-input-person-ids",
+              required=False,
+              default='N',
+              help="Use person ids as input without generating new integers")
+@click.option("--last-used-ids-file",
+              default=None,
+              required=False,
+              help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
+@click.option("--log-file-threshold",
+              required=False,
+              default=0,
+              help="Lower outcount limit for logfile output")
+@click.argument("input-dir",
+                required=False,
+                nargs=-1)
+def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, saved_person_id_file, use_input_person_ids, last_used_ids_file, log_file_threshold, input_dir):
+    """
+    Map to output using input streams
+    """
+    omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+    omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+    omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+
+    if os.path.isdir(input_dir[0]) == False:
+        print("Not a directory, input dir {0}".format(input_dir[0]))
+        sys.exit(1)
+
+    if os.path.isdir(output_dir) == False:
+        print("Not a directory, output dir {0}".format(output_dir))
+        sys.exit(1)
+
+    if saved_person_id_file == None:
+        saved_person_id_file = output_dir + "/" + "person_ids.tsv"
+        if os.path.exists(saved_person_id_file):
+            os.remove(saved_person_id_file)
+
+    starttime = time.time()
+    omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+    #print(omopcdm.dump_ddl())
+    mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
+    metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
+    nowtime = time.time()
+
+    print("--------------------------------------------------------------------------------")
+    print("Loaded mapping rules from: {0} after {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+    output_files = mappingrules.get_all_outfile_names()
+    record_numbers = {}
+    for output_file in output_files:
+        record_numbers[output_file] = 1
+
+    fhd = {}
+    tgtcolmaps = {}
+
+    try:
+        # Add in a saved-person-file existence test and reload from it if necessary, returning the last used integer
+        if os.path.isfile(saved_person_id_file):
+            person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+        else:
+            person_lookup = {}
+            last_used_integer = 1
+        if last_used_ids_file != None:
+            if os.path.isfile(last_used_ids_file):
+                record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+
+        #fhp = open(person_file, mode="r", encoding="utf-8-sig")
+        #csvrp = csv.reader(fhp)
+        person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
+        fhpout = open(saved_person_id_file, mode="w")
+        fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+        for person_id, person_assigned_id in person_lookup.items():
+            fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+        fhpout.close()
+        for tgtfile in output_files:
+            fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
+            if write_mode == 'w':
+                outhdr = omopcdm.get_omop_column_list(tgtfile)
+                fhd[tgtfile].write("\t".join(outhdr) + "\n")
+            tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
+
+    except IOError as e:
+        print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+        exit()
+
+    print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+
+    # TODO get this list of input files from the parsed rules
+    existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+    rules_input_files = mappingrules.get_all_infile_names()
+    for infile in existing_input_files:
+        if infile not in rules_input_files:
+            msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
+            print(msg)
+            metrics.add_log_data(msg)
+    for infile in rules_input_files:
+        if infile not in existing_input_files:
+            msg = "ERROR: no data for mapped input file - {0}".format(infile)
+            print(msg)
+            metrics.add_log_data(msg)
+    rejidcounts = {}
+    rejdatecounts = {}
+    #src_tgt_counts = {}
+    print(rules_input_files)
+
+    for srcfilename in rules_input_files:
+        rejidcounts[srcfilename] = 0
+        rejdatecounts[srcfilename] = 0
+
+    for srcfilename in rules_input_files:
+        outcounts = {}
+        rejcounts = {}
+        rcount = 0
+
+        try:
+            fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
+            csvr = csv.reader(fh)
+        except IOError as e:
+            print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
+            print("I/O error({0}): {1}".format(e.errno, e.strerror))
+            continue
+
+        tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
+        infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+        for tgtfile in tgtfiles:
+            outcounts[tgtfile] = 0
+            rejcounts[tgtfile] = 0
+        datacolsall = []
+        hdrdata = next(csvr)
+        dflist = mappingrules.get_infile_data_fields(srcfilename)
+        for colname in hdrdata:
+            datacolsall.append(colname)
+        inputcolmap = omopcdm.get_column_map(hdrdata)
+        pers_id_col = inputcolmap[infile_person_id_source]
+        datetime_col = inputcolmap[infile_datetime_source]
+        print("--------------------------------------------------------------------------------")
+        print("Processing input: {0}".format(srcfilename))
+        # print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
+
+        for indata in csvr:
+            #indata = inputline.strip().split(",")
+            key = srcfilename + "~all~all~all~"
+            metrics.increment_key_count(key, "input_count")
+            rcount += 1
+            strdate = indata[datetime_col].split(" ")[0]
+            fulldate = parse_date(strdate)
+            if fulldate != None:
+                #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                indata[datetime_col] = fulldate
+            else:
+                metrics.increment_key_count(key, "invalid_date_fields")
+                continue
+
+            for tgtfile in tgtfiles:
+                tgtcolmap = tgtcolmaps[tgtfile]
+                auto_num_col = omopcdm.get_omop_auto_number_field(tgtfile)
+                pers_id_col = omopcdm.get_omop_person_id_field(tgtfile)
+
+                datacols = datacolsall
+                if tgtfile in dflist:
+                    datacols = dflist[tgtfile]
+
+                for datacol in datacols:
+                    built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
+                    if built_records == True:
+                        for outrecord in outrecords:
+                            if auto_num_col != None:
+                                outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                                record_numbers[tgtfile] += 1
+                            if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
+                                outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                outcounts[tgtfile] += 1
+                                key = srcfilename + "~all~all~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                key = "all~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                key = srcfilename + "~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "output_count")
+                                if tgtfile == "person":
+                                    key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] +"~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
+                                    metrics.increment_key_count(key, "output_count")
+                                else:
+                                    key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                    key = "all~all~all~" + outrecord[2] + "~"
+                                    metrics.increment_key_count(key, "output_count")
+                                fhd[tgtfile].write("\t".join(outrecord) + "\n")
+                            else:
+                                key = srcfilename + "~all~" + tgtfile + "~all~"
+                                metrics.increment_key_count(key, "invalid_person_ids")
+                                rejidcounts[srcfilename] += 1
+
+        fh.close()
+
+        nowtime = time.time()
+        print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+        for outtablename, count in outcounts.items():
+            print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+
+    print("--------------------------------------------------------------------------------")
+    data_summary = metrics.get_mapstream_summary()
+    log_report = metrics.get_log_data()
+    try:
+        dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+        dsfh.write(data_summary)
+        dsfh.close()
+        logfh = open(output_dir + "/error_report.txt", mode="w")
+        logfh.write(log_report)
+        logfh.close()
+    except IOError as e:
+        print("I/O error({0}): {1}".format(e.errno, e.strerror))
+        print("Unable to write file")
+
+    nowtime = time.time()
+    print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
+    #profiler.disable()
+    #stats = pstats.Stats(profiler).sort_stats('ncalls')
+    #stats.print_stats()
+
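The mapstream command above wires the whole pipeline together. As a sketch (not part of the wheel), it can be exercised in-process with click's test runner; every path here is hypothetical, and if the directories do not exist the command exits early with the "Not a directory" message:

    from click.testing import CliRunner

    from carrottransform.cli.subcommands.run import mapstream

    # --rules-file, --person-file and --omop-version are the required options;
    # INPUT-DIR is the trailing variadic argument.
    result = CliRunner().invoke(mapstream, [
        "--rules-file", "rules.json",        # mapping rules (hypothetical file)
        "--person-file", "input/demo.csv",   # person_ids in the first column
        "--omop-version", "5.3",             # selects OMOPCDM_postgresql_5.3_ddl.sql
        "--output-dir", "output",            # must already exist
        "input",                             # directory of source csv files
    ])
    print(result.output)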
+def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    build_records = False
+    tgtrecords = []
+    date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+    date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+    notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+    srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
+    summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
+    if valid_value(str(srcdata[srccolmap[srcfield]])):
+        srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
+        dictkeys = []
+        if srcfullkey in rulesmap:
+            build_records = True
+            dictkeys.append(srcfullkey)
+        if srckey in rulesmap:
+            build_records = True
+            dictkeys.append(srckey)
+        if build_records == True:
+            for dictkey in dictkeys:
+                for out_data_elem in rulesmap[dictkey]:
+                    valid_data_elem = True
+                    tgtarray = ['']*len(tgtcolmap)
+                    for req_integer in notnull_numeric_fields:
+                        tgtarray[tgtcolmap[req_integer]] = "0"
+                    for infield, outfield_list in out_data_elem.items():
+                        for output_col_data in outfield_list:
+                            if "~" in output_col_data:
+                                outcol, term = output_col_data.split("~")
+                                tgtarray[tgtcolmap[outcol]] = term
+                            else:
+                                tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
+                                if output_col_data in date_component_data:
+                                    strdate = srcdata[srccolmap[infield]].split(" ")[0]
+                                    dt = get_datetime_value(strdate)
+                                    if dt != None:
+                                        year_field = date_component_data[output_col_data]["year"]
+                                        month_field = date_component_data[output_col_data]["month"]
+                                        day_field = date_component_data[output_col_data]["day"]
+                                        tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                        tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                        tgtarray[tgtcolmap[day_field]] = str(dt.day)
+                                        fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                                        tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                    else:
+                                        metrics.increment_key_count(summarykey, "invalid_date_fields")
+                                        valid_data_elem = False
+                                elif output_col_data in date_col_data:
+                                    fulldate = srcdata[srccolmap[infield]]
+                                    tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                    tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
+                    if valid_data_elem == True:
+                        tgtrecords.append(tgtarray)
+    else:
+        metrics.increment_key_count(summarykey, "invalid_source_fields")
+
+
+    return build_records, tgtrecords, metrics
+
+def valid_value(item):
+    """
+    Check if an item is non blank (null)
+    """
+    if item.strip() == "":
+        return(False)
+    return(True)
+
+def valid_date_value(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
+    or dd/mm/yyyy or mm/dd/yyyy
+    """
+    if item.strip() == "":
+        return(False)
+    if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
+        #print("Bad date : {0}".format(item))
+        return(False)
+    return(True)
+
+def get_datetime_value(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
+    or dd/mm/yyyy or mm/dd/yyyy
+    """
+    dt = None
+    # Does the date parse as an ISO date?
+    try:
+        dt = datetime.datetime.strptime(item, "%Y-%m-%d")
+    except ValueError:
+        pass
+    if dt != None:
+        return(dt)
+
+    # Does the date parse as a reverse ISO date?
+    try:
+        dt = datetime.datetime.strptime(item, "%d-%m-%Y")
+    except ValueError:
+        pass
+
+    if dt != None:
+        return(dt)
+
+    # Does the date parse as a UK old-style date?
+    try:
+        dt = datetime.datetime.strptime(item, "%d/%m/%Y")
+    except ValueError:
+        pass
+
+    if dt != None:
+        return(dt)
+
+    return None
+
+def parse_date(item):
+    """
+    Crude hand-coded check on date format
+    """
+    datedata = item.split("-")
+    if len(datedata) != 3:
+        datedata = item.split("/")
+    if len(datedata) != 3:
+        return None
+    if len(datedata[2]) == 4:
+        return("{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0]))
+    return("{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2]))
+
+
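The date helpers above are worth reading together: parse_date only reorders fields (slash-separated values are treated as day-first and flipped to year-first) and never validates them, while get_datetime_value is the strict parser; note the docstrings mention mm/dd/yyyy but only the day-first branch exists. A few worked values, assuming the functions are imported from this module:

    from carrottransform.cli.subcommands.run import get_datetime_value, parse_date

    print(parse_date("2024-01-31"))   # '2024-01-31' (already year-first, passed through)
    print(parse_date("31/01/2024"))   # '2024-01-31' (dd/mm/yyyy flipped to ISO order)
    print(parse_date("99/99/9999"))   # '9999-99-99' (reordered but never validated)
    print(parse_date("31012024"))     # None (neither '-' nor '/' splits into 3 parts)

    print(get_datetime_value("31/01/2024"))  # datetime(2024, 1, 31) via the %d/%m/%Y branch
    print(get_datetime_value("2024-02-30"))  # None - no format accepts an invalid day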
+def valid_iso_date(item):
+    """
+    Check if a date item is non null and parses as ISO (YYYY-MM-DD)
+    """
+    try:
+        datetime.datetime.strptime(item, "%Y-%m-%d")
+    except ValueError:
+        return(False)
+
+    return(True)
+
+def valid_reverse_iso_date(item):
+    """
+    Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
+    """
+    try:
+        datetime.datetime.strptime(item, "%d-%m-%Y")
+    except ValueError:
+        return(False)
+
+    return(True)
+
+def valid_uk_date(item):
+    """
+    Check if a date item is non null and parses as UK format (DD/MM/YYYY)
+    """
+    try:
+        datetime.datetime.strptime(item, "%d/%m/%Y")
+    except ValueError:
+        return(False)
+
+    return(True)
+
+def load_last_used_ids(last_used_ids_file, last_used_ids):
+    fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+
+    for last_ids_data in csvr:
+        last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+    fh.close()
+    return last_used_ids
+
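load_last_used_ids() reads the file named by --last-used-ids-file; per the option help above, each row is tablename<TAB>last_used_id. A small sketch with hypothetical values:

    from carrottransform.cli.subcommands.run import load_last_used_ids

    # Hypothetical --last-used-ids-file content.
    with open("last_used_ids.tsv", "w") as fh:
        fh.write("person\t1200\n")
        fh.write("observation\t53211\n")

    # Each table resumes numbering at last_used_id + 1.
    record_numbers = load_last_used_ids("last_used_ids.tsv", {"person": 1, "observation": 1})
    print(record_numbers)  # {'person': 1201, 'observation': 53212}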
+def load_saved_person_ids(person_file):
+    fh = open(person_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+    last_int = 1
+    person_ids = {}
+
+    next(csvr)
+    for persondata in csvr:
+        person_ids[persondata[0]] = persondata[1]
+        last_int += 1
+
+    fh.close()
+    return person_ids, last_int
+
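load_saved_person_ids() reads back the SOURCE_SUBJECT/TARGET_SUBJECT file that mapstream writes above. A sketch with hypothetical ids:

    from carrottransform.cli.subcommands.run import load_saved_person_ids

    # Hypothetical saved-person-id file, in the exact shape mapstream writes.
    with open("person_ids.tsv", "w") as fh:
        fh.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
        fh.write("ABC123\t1\n")
        fh.write("DEF456\t2\n")

    person_ids, last_int = load_saved_person_ids("person_ids.tsv")
    print(person_ids)  # {'ABC123': '1', 'DEF456': '2'}
    print(last_int)    # 3 - one more than the number of saved rows, not the max id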
+def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
+    fh = open(person_file, mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter=delim)
+    person_columns = {}
+    person_col_in_hdr_number = 0
+    reject_count = 0
+
+    personhdr = next(csvr)
+    print(personhdr)
+
+    # Make a dictionary of column names vs their positions
+    for col in personhdr:
+        person_columns[col] = person_col_in_hdr_number
+        person_col_in_hdr_number += 1
+
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
+    print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+    person_col = person_columns[person_id_source]
+
+    for persondata in csvr:
+        if not valid_value(persondata[person_columns[person_id_source]]):
+            reject_count += 1
+            continue
+        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+            reject_count += 1
+            continue
+        if persondata[person_col] not in person_ids:
+            if use_input_person_ids == "N":
+                person_ids[persondata[person_col]] = str(person_number)
+                person_number += 1
+            else:
+                person_ids[persondata[person_col]] = str(persondata[person_col])
+    fh.close()
+
+    return person_ids, reject_count
+
+@click.group(help="Commands for using python configurations to run the ETL transformation.")
+def py():
+    pass
+
+run.add_command(mapstream,"mapstream")
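That last line completes the command wiring: command.py registered run under the transform group, and run.add_command registers mapstream under run, so the full CLI path nests as transform -> run -> mapstream. A final end-to-end sketch through the top-level group (hypothetical paths again):

    from click.testing import CliRunner

    from carrottransform.cli.command import transform

    result = CliRunner().invoke(transform, [
        "run", "mapstream",
        "--rules-file", "rules.json",
        "--person-file", "input/demo.csv",
        "--omop-version", "5.3",
        "--output-dir", "output",
        "input",
    ])
    print(result.output)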