carrot-transform 0.3.tar.gz → 0.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of carrot-transform might be problematic.
- {carrot_transform-0.3 → carrot_transform-0.3.2}/PKG-INFO +2 -2
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/PKG-INFO +2 -2
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.py +1 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/_version.py +1 -1
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/run.py +37 -25
- carrot_transform-0.3.2/carrottransform/tools/file_helpers.py +15 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/mappingrules.py +4 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/metrics.py +10 -8
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/omopcdm.py +7 -2
- {carrot_transform-0.3 → carrot_transform-0.3.2}/pyproject.toml +2 -2
- carrot_transform-0.3/carrottransform/tools/file_helpers.py +0 -14
- {carrot_transform-0.3 → carrot_transform-0.3.2}/.github/workflows/pypi.publish.yml +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/.gitignore +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/LICENSE +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/MANIFEST.in +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/README.md +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/SOURCES.txt +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/dependency_links.txt +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/entry_points.txt +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/top_level.txt +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/__init__.py +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/__init__.py +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/command.py +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/__init__.py +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/omop.json +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/__init__.py +0 -0
- {carrot_transform-0.3 → carrot_transform-0.3.2}/setup.cfg +0 -0
{carrot_transform-0.3 → carrot_transform-0.3.2}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/_version.py

@@ -1,2 +1,2 @@
 # TODO - pick this up automatically when building
-__version__ = '0.3'
+__version__ = '0.3.2'
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/run.py

@@ -27,8 +27,14 @@ def run():
 @click.option("--person-file",
               required=True,
               help="File containing person_ids in the first column")
+@click.option("--omop-ddl-file",
+              required=False,
+              help="File containing OHDSI ddl statements for OMOP tables")
+@click.option("--omop-config-file",
+              required=False,
+              help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
-              required=
+              required=False,
               help="Quoted string containing opmop version - eg '5.3'")
 @click.option("--saved-person-id-file",
               default=None,
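The three OMOP-related options are now all optional. Below is a minimal invocation sketch using click's test runner, passing only --omop-version so the packaged defaults resolve; the import path and option names are inferred from this diff and the file list above, not confirmed against the released CLI.

```python
from click.testing import CliRunner

# Assumed import path, following the package layout shown in the file list.
from carrottransform.cli.subcommands.run import mapstream

runner = CliRunner()
result = runner.invoke(mapstream, [
    "--rules-file", "rules.json",       # option names inferred from the signature
    "--output-dir", "out",
    "--person-file", "persons.csv",
    "--omop-version", "5.3",            # ddl/config files left unset -> defaults resolve
    "input_data/",                      # positional input-dir argument
])
print(result.exit_code, result.output)
```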
@@ -49,13 +55,23 @@ def run():
 @click.argument("input-dir",
                 required=False,
                 nargs=-1)
-def mapstream(rules_file, output_dir, write_mode,
+def mapstream(rules_file, output_dir, write_mode,
+              person_file, omop_ddl_file, omop_config_file,
+              omop_version, saved_person_id_file, use_input_person_ids,
+              last_used_ids_file, log_file_threshold, input_dir):
     """
     Map to output using input streams
     """
-
-
-
+    # Initialisation
+    # - check for values in optional arguments
+    # - read in configuration files
+    # - check main directories for existence
+    # - handle saved persion ids
+    # - initialise metrics
+    if (omop_ddl_file == None) and (omop_config_file == None) and (omop_version != None):
+        omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+        omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
 
     if os.path.isdir(input_dir[0]) == False:
         print("Not a directory, input dir {0}".format(input_dir[0]))
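The new block above resolves default config paths from the installed package when only a version is supplied. A standalone sketch of that logic, assuming carrottransform is importable; the path names mirror the diff:

```python
import importlib.resources

def default_omop_paths(omop_version):
    # Locate the package's config directory on the installed filesystem.
    base = str(importlib.resources.files('carrottransform'))
    omop_config_file = base + '/' + 'config/omop.json'
    # DDL file name is keyed on the OMOP version string, e.g. "5.3".
    omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
    omop_ddl_file = base + '/' + 'config/' + omop_ddl_file_name
    return omop_config_file, omop_ddl_file

print(default_omop_paths('5.3'))
```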
@@ -72,13 +88,12 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     starttime = time.time()
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
-    #print(omopcdm.dump_ddl())
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
     nowtime = time.time()
 
     print("--------------------------------------------------------------------------------")
-    print("Loaded mapping rules from: {0}
+    print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
     output_files = mappingrules.get_all_outfile_names()
     record_numbers = {}
     for output_file in output_files:
@@ -88,7 +103,7 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     tgtcolmaps = {}
 
     try:
-        #
+        # Saved-person-file existence test, reload if found, return last used integer
         if os.path.isfile(saved_person_id_file):
            person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
         else:
@@ -98,14 +113,13 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         if os.path.isfile(last_used_ids_file):
             record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
 
-        #fhp = open(person_file, mode="r", encoding="utf-8-sig")
-        #csvrp = csv.reader(fhp)
         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
         fhpout = open(saved_person_id_file, mode="w")
         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
         for person_id, person_assigned_id in person_lookup.items():
             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
         fhpout.close()
+        # Initialise output files, output a header for each
         for tgtfile in output_files:
             fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
             if write_mode == 'w':
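The saved person-id table written above is a two-column TSV mapping source subjects to assigned integers. load_saved_person_ids is not shown in this diff, so the reader below is an assumption built only to round-trip the writer shown in the hunk:

```python
def save_person_ids(path, person_lookup):
    # Writer, as in the diff: header line, then one tab-separated row per person.
    with open(path, "w") as fh:
        fh.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
        for src, tgt in person_lookup.items():
            fh.write("{0}\t{1}\n".format(src, tgt))

def load_saved_person_ids_sketch(path):
    # Hypothetical reader: rebuild the lookup and report the last used integer.
    person_lookup = {}
    last_used_integer = 0
    with open(path) as fh:
        next(fh)  # skip header
        for line in fh:
            src, tgt = line.rstrip("\n").split("\t")
            person_lookup[src] = tgt
            last_used_integer = max(last_used_integer, int(tgt))
    return person_lookup, last_used_integer

save_person_ids("person_ids.tsv", {"P001": 1, "P002": 2})
print(load_saved_person_ids_sketch("person_ids.tsv"))
```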
@@ -119,28 +133,30 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
 
-    #
+    # Compare files found in the input_dir with those expected based on mapping rules
     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
     rules_input_files = mappingrules.get_all_infile_names()
+    # Log mismatches but continue
     for infile in existing_input_files:
         if infile not in rules_input_files:
             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
             print(msg)
-            metrics.add_log_data(msg)
     for infile in rules_input_files:
         if infile not in existing_input_files:
             msg = "ERROR: no data for mapped input file - {0}".format(infile)
             print(msg)
-
+
+    # set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-    #src_tgt_counts = {}
     print(rules_input_files)
 
+    # set up per-input counts
     for srcfilename in rules_input_files:
         rejidcounts[srcfilename] = 0
         rejdatecounts[srcfilename] = 0
 
+    # main processing loop, for each input file
     for srcfilename in rules_input_files:
         outcounts = {}
         rejcounts = {}
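The same cross-check, condensed with set differences; a sketch only, equivalent to the diff's loop form, where mismatches in either direction are reported but do not stop the run:

```python
# Example inputs standing in for os.listdir() and the rules file contents.
existing_input_files = {"demo.csv", "orphan.csv"}
rules_input_files = {"demo.csv", "missing.csv"}

# Data present on disk with no rule to consume it.
for infile in sorted(existing_input_files - rules_input_files):
    print("ERROR: no mapping rules found for existing input file - {0}".format(infile))
# Rules expecting a file that is not present.
for infile in sorted(rules_input_files - existing_input_files):
    print("ERROR: no data for mapped input file - {0}".format(infile))
```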
@@ -169,17 +185,15 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
             datetime_col = inputcolmap[infile_datetime_source]
             print("--------------------------------------------------------------------------------")
             print("Processing input: {0}".format(srcfilename))
-
-
+
+            # for each input record
             for indata in csvr:
-                #indata = inputline.strip().split(",")
                 key = srcfilename + "~all~all~all~"
                 metrics.increment_key_count(key, "input_count")
                 rcount += 1
                 strdate = indata[datetime_col].split(" ")[0]
                 fulldate = parse_date(strdate)
                 if fulldate != None:
-                    #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                     indata[datetime_col] = fulldate
                 else:
                     metrics.increment_key_count(key, "invalid_date_fields")
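parse_date is called above but not defined anywhere in this diff. A hypothetical sketch of such a helper, not the package's implementation: try a few common layouts and return an ISO yyyy-mm-dd string, or None so the caller can count the record as an invalid date field:

```python
from datetime import datetime

def parse_date_sketch(strdate):
    # Accepted layouts are illustrative assumptions, not taken from the package.
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%Y%m%d"):
        try:
            return datetime.strptime(strdate, fmt).strftime("%Y-%m-%d")
        except ValueError:
            pass
    return None  # caller increments "invalid_date_fields"

print(parse_date_sketch("2024-01-31"), parse_date_sketch("31/01/2024"), parse_date_sketch("bad"))
```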
@@ -236,28 +250,26 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
         for outtablename, count in outcounts.items():
             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+    # END main processing loop
 
     print("--------------------------------------------------------------------------------")
     data_summary = metrics.get_mapstream_summary()
-    log_report = metrics.get_log_data()
     try:
         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
         dsfh.write(data_summary)
         dsfh.close()
-        logfh = open(output_dir + "/error_report.txt", mode="w")
-        logfh.write(log_report)
-        logfh.close()
     except IOError as e:
         print("I/O error({0}): {1}".format(e.errno, e.strerror))
         print("Unable to write file")
 
+    # END mapstream
     nowtime = time.time()
     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
-    #profiler.disable()
-    #stats = pstats.Stats(profiler).sort_stats('ncalls')
-    #stats.print_stats()
 
 def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    """
+    build all target records for a given input field
+    """
     build_records = False
     tgtrecords = []
     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
carrot_transform-0.3.2/carrottransform/tools/file_helpers.py (new file)

@@ -0,0 +1,15 @@
+import os
+import sys
+import json
+
+# Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
+
+def load_json(f_in):
+    try:
+        data = json.load(open(f_in))
+    except Exception as err:
+        print ("{0} not found. Or cannot parse as json".format(f_in))
+        sys.exit()
+
+    return data
+
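A short usage sketch for the new helper. Note the behaviour change versus the deleted 0.3 version (shown at the bottom of this diff): any unreadable or unparseable file now ends the process via sys.exit() rather than raising. The import path follows the file location above and is an assumption:

```python
import json

from carrottransform.tools.file_helpers import load_json  # path as in the diff

# Write a small throwaway file so the example is self-contained.
with open("example.json", "w") as fh:
    json.dump({"example": 1}, fh)

data = load_json("example.json")   # a bad path here would exit the process
print(data["example"])
```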
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/mappingrules.py

@@ -4,6 +4,10 @@ import carrottransform.tools as tools
 from .omopcdm import OmopCDM
 
 class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
+    as a file-specific dictionary allowing rules to be "looked-up" depending on data content
+    """
 
     def __init__(self, rulesfilepath, omopcdm):
         self.rules_data = tools.load_json(rulesfilepath)
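The rules-json schema itself is not shown in this diff, so the reorganisation the new docstring describes is illustrated below with a hypothetical flat rule structure regrouped into a per-input-file lookup; field names and shapes are invented for illustration only:

```python
# Hypothetical flat rules, standing in for the untransformed json.
rules_data = [
    {"infile": "demo.csv", "field": "sex",   "target": "person"},
    {"infile": "demo.csv", "field": "dob",   "target": "person"},
    {"infile": "obs.csv",  "field": "score", "target": "observation"},
]

# Regroup into infile -> field -> [rules], so rules can be looked up per file
# as each input is processed (the general idea in the docstring above).
rules_by_file = {}
for rule in rules_data:
    rules_by_file.setdefault(rule["infile"], {}).setdefault(rule["field"], []).append(rule)

print(rules_by_file["demo.csv"]["sex"])
```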
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/metrics.py

@@ -1,12 +1,21 @@
 class Metrics():
+    """
+    Capture metrics for output to a summary tsv file, record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
     def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
         self.datasummary={}
         self.allcounts={}
-        self.log_data=""
         self.dataset_name=dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):
+        """
+        return a new, initialised, count structure
+        """
         counts = {}
         counts["input_count"] = 0
         counts["invalid_persids"] = 0
@@ -118,10 +127,3 @@ class Metrics():
         summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
 
         return summary_str
-
-    def add_log_data(self, msg):
-        self.log_data += msg + "\n"
-
-    def get_log_data(self):
-        return self.log_data
-
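A minimal sketch of the counting principle the new docstring describes: counts keyed by a data key (dkey) plus a named counter. Only increment_key_count appears in the run.py hunks above; the rest of this class body is assumed for illustration:

```python
class MetricsSketch:
    """Toy analogue of Metrics: nested dict of dkey -> counter name -> count."""

    def __init__(self, dataset_name, log_threshold=0):
        self.datasummary = {}
        self.dataset_name = dataset_name
        self.log_threshold = log_threshold

    def increment_key_count(self, dkey, count_name):
        # Create the per-key count structure on first use, then bump the counter.
        counts = self.datasummary.setdefault(dkey, {})
        counts[count_name] = counts.get(count_name, 0) + 1

m = MetricsSketch("demo")
m.increment_key_count("demo.csv~all~all~all~", "input_count")  # key style as in run.py
m.increment_key_count("demo.csv~all~all~all~", "input_count")
print(m.datasummary)
```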
{carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/omopcdm.py

@@ -4,6 +4,11 @@ import re
 import sys
 
 class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
 
     def __init__(self, omopddl, omopcfg):
         self.numeric_types = ["integer", "numeric"]
@@ -24,8 +29,8 @@ class OmopCDM:
     def load_ddl(self, omopddl):
         try:
             fp = open(omopddl, "r")
-        except
-            print("
+        except Exception as err:
+            print("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
 
         return(self.process_ddl(fp))
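process_ddl is not shown in this diff. A hypothetical sketch of turning OMOP CREATE TABLE ddl into the "in-memory json CDM" the new docstring mentions, here as a plain dict of table name to column names; the real parser may differ:

```python
import re

def process_ddl_sketch(ddl_text):
    # Naive parse: capture each table name and the column list between its parentheses.
    cdm = {}
    for table, body in re.findall(
            r"CREATE TABLE\s+(?:\w+\.)?(\w+)\s*\((.*?)\);", ddl_text, re.S | re.I):
        # First token of each comma-separated line is the column name.
        cols = [line.strip().split()[0] for line in body.split(",") if line.strip()]
        cdm[table.lower()] = cols
    return cdm

ddl = "CREATE TABLE person (person_id integer NOT NULL, gender_concept_id integer);"
print(process_ddl_sketch(ddl))
```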
{carrot_transform-0.3 → carrot_transform-0.3.2}/pyproject.toml

@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "carrot-transform"
-version = "0.3"
+version = "0.3.2"
 authors = [
   { name="PD Appleby", email="pdappleby@gmail.com" },
 ]
-description = "Carrot
+description = "Carrot simple transformer, input rules and data csv's, output OMOP"
 readme = "README.md"
 requires-python = ">=3.9"
 classifiers = [
carrot_transform-0.3/carrottransform/tools/file_helpers.py (deleted)

@@ -1,14 +0,0 @@
-import os
-import json
-
-def load_json(f_in):
-    if os.path.exists(f_in):
-        data = json.load(open(f_in))
-    else:
-        try:
-            data = json.loads(f_in)
-        except Exception as err:
-            raise FileNotFoundError(f"{f_in} not found. Or cannot parse as json")
-
-    return data
-