carrot-transform 0.1.0__py3-none-any.whl

@@ -0,0 +1,161 @@
+import os
+import json
+import carrottransform.tools as tools
+from .omopcdm import OmopCDM
+
+class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed JSON; as each input file is
+    processed, the rules are reorganised into a file-specific dictionary so they can be
+    looked up depending on data content
+    """
+
+    def __init__(self, rulesfilepath, omopcdm):
+        self.rules_data = tools.load_json(rulesfilepath)
+        self.omopcdm = omopcdm
+
+        self.parsed_rules = {}
+        self.outfile_names = {}
+
+        self.dataset_name = self.get_dsname_from_rules()
+
+    def dump_parsed_rules(self):
+        return json.dumps(self.parsed_rules, indent=2)
+
+    def get_dsname_from_rules(self):
+        dsname = "Unknown"
+
+        if "metadata" in self.rules_data:
+            if "dataset" in self.rules_data["metadata"]:
+                dsname = self.rules_data["metadata"]["dataset"]
+
+        return dsname
+
+    def get_dataset_name(self):
+        return self.dataset_name
+
+    def get_all_outfile_names(self):
+        return list(self.rules_data["cdm"])
+
+    def get_all_infile_names(self):
+        file_list = []
+
+        for outfilename, conditions in self.rules_data["cdm"].items():
+            for outfield, source_field in conditions.items():
+                for source_field_name, source_data in source_field.items():
+                    if "source_table" in source_data:
+                        if source_data["source_table"] not in file_list:
+                            file_list.append(source_data["source_table"])
+
+        return file_list
+
+    def get_infile_data_fields(self, infilename):
+        data_fields_lists = {}
+
+        outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+
+        for outfilename in outfilenames:
+            data_fields_lists[outfilename] = []
+
+        for key, outfield_data in outdata.items():
+            keydata = key.split("~")
+            outfile = keydata[-1]
+            for outfield_elem in outfield_data:
+                for infield, outfields in outfield_elem.items():
+                    for outfield in outfields:
+                        outfielddata = outfield.split("~")
+                        if self.omopcdm.is_omop_data_field(outfile, outfielddata[0]):
+                            if infield not in data_fields_lists[outfile]:
+                                data_fields_lists[outfile].append(infield)
+
+        return data_fields_lists
+
+    def get_infile_date_person_id(self, infilename):
+        outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+        datetime_source = ""
+        person_id_source = ""
+
+        for key, outfield_data in outdata.items():
+            keydata = key.split("~")
+            outfile = keydata[-1]
+            for outfield_elem in outfield_data:
+                for infield, outfield_list in outfield_elem.items():
+                    for outfield in outfield_list:
+                        if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
+                            datetime_source = infield
+                        if outfield == self.omopcdm.get_omop_person_id_field(outfile):
+                            person_id_source = infield
+
+        return datetime_source, person_id_source
+
+    def get_person_source_field_info(self, tgtfilename):
+        """
+        Specific discovery of input data field names for 'person' in these rules
+        """
+        birth_datetime_source = None
+        person_id_source = None
+        if tgtfilename in self.rules_data["cdm"]:
+            source_rules_data = self.rules_data["cdm"][tgtfilename]
+            for rule_name, rule_fields in source_rules_data.items():
+                if "birth_datetime" in rule_fields:
+                    birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
+                if "person_id" in rule_fields:
+                    person_id_source = rule_fields["person_id"]["source_field"]
+
+        return birth_datetime_source, person_id_source
+
+    def parse_rules_src_to_tgt(self, infilename):
+        """
+        Parse rules to produce a map of source to target data for a given input file
+        """
+        if infilename in self.outfile_names and infilename in self.parsed_rules:
+            return self.outfile_names[infilename], self.parsed_rules[infilename]
+        outfilenames = []
+        outdata = {}
+
+        for outfilename, rules_set in self.rules_data["cdm"].items():
+            for datatype, rules in rules_set.items():
+                key, data = self.process_rules(infilename, outfilename, rules)
+                if key != "":
+                    if key not in outdata:
+                        outdata[key] = []
+                    outdata[key].append(data)
+                    if outfilename not in outfilenames:
+                        outfilenames.append(outfilename)
+
+        self.parsed_rules[infilename] = outdata
+        self.outfile_names[infilename] = outfilenames
+        return outfilenames, outdata
+
+    def process_rules(self, infilename, outfilename, rules):
+        """
+        Process rules for an infile, outfile combination
+        """
+        data = {}
+        plain_key = ""
+        term_value_key = ""
+
+        for outfield, source_info in rules.items():
+            if source_info["source_table"] == infilename:
+                # only collect fields sourced from this input file
+                if source_info["source_field"] not in data:
+                    data[source_info["source_field"]] = []
+                if "term_mapping" in source_info:
+                    if isinstance(source_info["term_mapping"], dict):
+                        for inputvalue, term in source_info["term_mapping"].items():
+                            term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
+                            data[source_info["source_field"]].append(outfield + "~" + str(term))
+                    else:
+                        plain_key = infilename + "~" + source_info["source_field"] + "~" + outfilename
+                        data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"]))
+                else:
+                    data[source_info["source_field"]].append(outfield)
+        if term_value_key != "":
+            return term_value_key, data
+
+        return plain_key, data
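
To make the rule parsing concrete, here is a minimal sketch of what process_rules() produces, using an invented rule fragment (the file, field and concept names are hypothetical, but the shape matches the JSON this parser walks):

    # hypothetical rule fragment for the "person" output table
    rules = {
        "gender_concept_id": {
            "source_table": "demo.csv",
            "source_field": "sex",
            "term_mapping": {"M": 8507},
        },
        "person_id": {"source_table": "demo.csv", "source_field": "id"},
    }
    # process_rules("demo.csv", "person", rules) would return:
    #   key  -> "demo.csv~sex~M~person"
    #   data -> {"sex": ["gender_concept_id~8507"], "id": ["person_id"]}
    # parse_rules_src_to_tgt("demo.csv") then groups these data dicts under each key.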
@@ -0,0 +1,129 @@
+class Metrics:
+    """
+    Capture metrics for output to a summary tsv file; record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
+    def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
+        self.log_threshold = log_threshold
+
+    def get_new_mapstream_counts(self):
+        """
+        return a new, initialised, count structure
+        """
+        # NOTE: these key names must match the ones read back in get_mapstream_summary()
+        counts = {}
+        counts["input_count"] = 0
+        counts["invalid_person_ids"] = 0
+        counts["invalid_date_fields"] = 0
+        counts["invalid_source_fields"] = 0
+        counts["output_count"] = 0
+
+        return counts
+
+    def add_data(self, desttablename, increment):
+        """
+        add_data(self, destination table, data increment)
+        Apply the contents of a data increment to the stored self.datasummary
+        """
+        name = increment["name"]
+        for datakey, dataitem in increment.items():
+            if datakey in ("valid_person_id", "person_id"):
+                dkey = "NA" + "." + desttablename + "." + name + "." + datakey
+                self.add_counts_to_summary(dkey, dataitem)
+            elif datakey == "required_fields":
+                for fieldname in dataitem:
+                    prfx = "NA"
+                    if "source_files" in increment:
+                        if fieldname in increment["source_files"]:
+                            prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
+                    dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                    self.add_counts_to_summary(dkey, dataitem[fieldname])
+
+    def get_prefix(self, fname):
+        return fname.split(".")[0]
+
+    def add_counts_to_summary(self, dkey, count_block):
+        if dkey not in self.datasummary:
+            self.datasummary[dkey] = {}
+        for counttype in count_block:
+            if counttype not in self.datasummary[dkey]:
+                self.datasummary[dkey][counttype] = 0
+            self.datasummary[dkey][counttype] += int(count_block[counttype])
+
+    def increment_key_count(self, dkey, count_type):
+        """
+        Intended to work with the mapstream functions
+        """
+        if dkey not in self.datasummary:
+            self.datasummary[dkey] = {}
+        if count_type not in self.datasummary[dkey]:
+            self.datasummary[dkey][count_type] = 0
+        self.datasummary[dkey][count_type] += 1
+ self.datasummary[dkey][count_type] += 1
70
+
71
+ def get_summary(self):
72
+ summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
73
+
74
+ for dkey in self.datasummary:
75
+ #print(dkey)
76
+ source, tablename, name, colname = dkey.split('.')
77
+ before_count = int(self.datasummary[dkey]["before"])
78
+ after_count = int(self.datasummary[dkey]["after"])
79
+ after_pct = (float)(before_count - after_count) * 100 / before_count
80
+ summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
81
+ if "after_formatting" in self.datasummary[dkey]:
82
+ after_format_count = int(self.datasummary[dkey]["after_formatting"])
83
+ after_format_pct = (float)(after_count - after_format_count) * 100 / after_count
84
+ summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
85
+ else:
86
+ summary_str += "NA\tNA\n"
87
+
88
+ return summary_str
89
+
90
+ def get_data_summary(self):
91
+ return self.datasummary
+
+    def get_mapstream_summary(self):
+        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
+
+        for dkey in sorted(self.datasummary):
+            try:
+                source, fieldname, tablename, concept_id, additional = dkey.split('~')
+            except ValueError:
+                print("get_mapstream_summary - ValueError: {0}".format(dkey))
+                break
+
+            source = self.get_prefix(source)
+            dvalue = self.datasummary[dkey]
+
+            input_count = str(dvalue.get("input_count", 0))
+            invalid_person_ids = str(dvalue.get("invalid_person_ids", 0))
+            invalid_source_fields = str(dvalue.get("invalid_source_fields", 0))
+            invalid_date_fields = str(dvalue.get("invalid_date_fields", 0))
+            output_count = str(dvalue.get("output_count", 0))
+
+            if int(output_count) >= self.log_threshold:
+                summary_str += "\t".join([self.dataset_name, source, fieldname, tablename,
+                                          concept_id, additional, input_count, invalid_person_ids,
+                                          invalid_date_fields, invalid_source_fields, output_count]) + "\n"
+
+        return summary_str
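
A minimal usage sketch for the mapstream counts (the dataset name and dkey are invented; the dkey must be the five-part "~"-delimited form that get_mapstream_summary() unpacks):

    m = Metrics("my_dataset")
    dkey = "demo.csv~sex~person~8507~"   # source~source_field~target~concept_id~additional
    m.increment_key_count(dkey, "input_count")
    m.increment_key_count(dkey, "output_count")
    print(m.get_mapstream_summary())
    # header line, then one tab-separated row (empty "additional" column):
    # my_dataset  demo  sex  person  8507    1  0  0  0  1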
@@ -0,0 +1,187 @@
+import carrottransform.tools as tools
+import json
+import re
+import sys
+
+class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
+
+    def __init__(self, omopddl, omopcfg):
+        self.numeric_types = ["integer", "numeric"]
+        self.datetime_types = ["timestamp"]
+        self.date_types = ["date"]
+        self.omop_json = self.load_ddl(omopddl)
+        self.omop_json = self.merge_json(self.omop_json, omopcfg)
+        self.all_columns = self.get_columns("all_columns")
+        self.numeric_fields = self.get_columns("numeric_fields")
+        self.notnull_numeric_fields = self.get_columns("notnull_numeric_fields")
+        self.datetime_linked_fields = self.get_columns("datetime_linked_fields")
+        self.date_field_components = self.get_columns("date_field_components")
+        self.datetime_fields = self.get_columns("datetime_fields")
+        self.person_id_field = self.get_columns("person_id_field")
+        self.auto_number_field = self.get_columns("auto_number_field")
+
+
+    def load_ddl(self, omopddl):
+        try:
+            with open(omopddl, "r") as fp:
+                return self.process_ddl(fp)
+        except OSError:
+            print("OMOP ddl file ({0}) not found".format(omopddl))
+            sys.exit(1)
+
+    def process_ddl(self, fp):
+        """
+        Process the omop ddl file to output the attributes which CaRROT-CDM understands
+        Matching of selected parts of the ddl definition is performed using regexes
+        """
+        output_dict = {}
+        output_dict["all_columns"] = {}
+        output_dict["numeric_fields"] = {}
+        output_dict["notnull_numeric_fields"] = {}
+        output_dict["datetime_fields"] = {}
+        output_dict["date_fields"] = {}
+
+        ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
+        datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        end_rgx = re.compile(r'.*[)];$')
+        vermatched = False
+        processing_table_data = False
+        tabname = ""
+
+        for line in fp:
+            line = line.strip()
+            # check for line with version, if present
+            if not vermatched:
+                vmatch = ver_rgx.search(line)
+                if vmatch is not None:
+                    version_string = vmatch.group(1)
+                    output_dict["omop_version"] = version_string
+                    vermatched = True
+            # check for start of table definition
+            if not processing_table_data:
+                smatch = start_rgx.search(line)
+                if smatch is not None:
+                    processing_table_data = True
+                    tabname = smatch.group(2).lower()
+            else:
+                idtmatch = datatype_rgx.search(line)
+                if idtmatch is not None:
+                    fname = idtmatch.group(1)
+                    ftype = idtmatch.group(2)
+
+                    # Check for dictionary element presence
+                    if tabname not in output_dict["all_columns"]:
+                        output_dict["all_columns"][tabname] = []
+                    if tabname not in output_dict["numeric_fields"]:
+                        output_dict["numeric_fields"][tabname] = []
+                    if tabname not in output_dict["notnull_numeric_fields"]:
+                        output_dict["notnull_numeric_fields"][tabname] = []
+                    if tabname not in output_dict["datetime_fields"]:
+                        output_dict["datetime_fields"][tabname] = []
+                    if tabname not in output_dict["date_fields"]:
+                        output_dict["date_fields"][tabname] = []
+
+                    # Add in required column / field data
+                    output_dict["all_columns"][tabname].append(fname)
+                    if ftype.lower() in self.numeric_types:
+                        output_dict["numeric_fields"][tabname].append(fname)
+                    if ftype.lower() in self.numeric_types and "NOT" in line and "NULL" in line:
+                        output_dict["notnull_numeric_fields"][tabname].append(fname)
+                    if ftype.lower() in self.datetime_types:
+                        output_dict["datetime_fields"][tabname].append(fname)
+                    if ftype.lower() in self.date_types:
+                        output_dict["date_fields"][tabname].append(fname)
+
+                ematch = end_rgx.search(line)
+                if ematch is not None:
+                    processing_table_data = False
+
+        return output_dict
+
+    def dump_ddl(self):
+        return json.dumps(self.omop_json, indent=2)
+
+    def merge_json(self, omopjson, omopcfg):
+        tmp_json = tools.load_json(omopcfg)
+        for key, data in tmp_json.items():
+            omopjson[key] = data
+        return omopjson
+
+    def get_columns(self, colkey):
+        if colkey in self.omop_json:
+            return self.omop_json[colkey]
+        return None
+
+    def get_column_map(self, colarr, delim=","):
+        colmap = {}
+        for i, col in enumerate(colarr):
+            colmap[col] = i
+        return colmap
+
+    def get_omop_column_map(self, tablename):
+        if tablename in self.all_columns:
+            return self.get_column_map(self.all_columns[tablename])
+        return None
+
+    def get_omop_column_list(self, tablename):
+        if tablename in self.all_columns:
+            return self.all_columns[tablename]
+        return None
+
+    def is_omop_data_field(self, tablename, fieldname):
+        if fieldname in self.get_omop_datetime_linked_fields(tablename):
+            return False
+        if fieldname in self.get_omop_datetime_fields(tablename):
+            return False
+        # the person-id lookup returns a single field name (or None), so test
+        # equality rather than "in" substring membership
+        if fieldname == self.get_omop_person_id_field(tablename):
+            return False
+        return True
+
+    def get_omop_numeric_fields(self, tablename):
+        if self.numeric_fields is not None:
+            if tablename in self.numeric_fields:
+                return self.numeric_fields[tablename]
+        return []
+
+    def get_omop_notnull_numeric_fields(self, tablename):
+        if self.notnull_numeric_fields is not None:
+            if tablename in self.notnull_numeric_fields:
+                return self.notnull_numeric_fields[tablename]
+        return []
+
+    def get_omop_datetime_linked_fields(self, tablename):
+        if self.datetime_linked_fields is not None:
+            if tablename in self.datetime_linked_fields:
+                return self.datetime_linked_fields[tablename]
+        return {}
+
+    def get_omop_date_field_components(self, tablename):
+        if self.date_field_components is not None:
+            if tablename in self.date_field_components:
+                return self.date_field_components[tablename]
+        return {}
+
+    def get_omop_datetime_fields(self, tablename):
+        if self.datetime_fields is not None:
+            if tablename in self.datetime_fields:
+                return self.datetime_fields[tablename]
+        return []
+
+    def get_omop_person_id_field(self, tablename):
+        if self.person_id_field is not None:
+            if tablename in self.person_id_field:
+                return self.person_id_field[tablename]
+        return None
+
+    def get_omop_auto_number_field(self, tablename):
+        if self.auto_number_field is not None:
+            if tablename in self.auto_number_field:
+                return self.auto_number_field[tablename]
+        return None
+ return None