carrot_transform-0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic.

@@ -0,0 +1,157 @@
+ import json
+
+ import carrottransform.tools as tools
+ from .omopcdm import OmopCDM
+
+
+ class MappingRules:
+     """Parsed view of a CaRROT mapping-rules JSON file."""
+
+     def __init__(self, rulesfilepath, omopcdm):
+         self.rules_data = tools.load_json(rulesfilepath)
+         self.omopcdm = omopcdm
+
+         # caches keyed by input file name, filled by parse_rules_src_to_tgt()
+         self.parsed_rules = {}
+         self.outfile_names = {}
+
+         self.dataset_name = self.get_dsname_from_rules()
+
+     def dump_parsed_rules(self):
+         return json.dumps(self.parsed_rules, indent=2)
+
+     def get_dsname_from_rules(self):
+         dsname = "Unknown"
+
+         if "metadata" in self.rules_data:
+             if "dataset" in self.rules_data["metadata"]:
+                 dsname = self.rules_data["metadata"]["dataset"]
+
+         return dsname
+
+     def get_dataset_name(self):
+         return self.dataset_name
+
+     def get_all_outfile_names(self):
+         return list(self.rules_data["cdm"])
+
+     def get_all_infile_names(self):
+         file_list = []
+
+         for outfilename, conditions in self.rules_data["cdm"].items():
+             for outfield, source_field in conditions.items():
+                 for source_field_name, source_data in source_field.items():
+                     if "source_table" in source_data:
+                         if source_data["source_table"] not in file_list:
+                             file_list.append(source_data["source_table"])
+
+         return file_list
+
+     def get_infile_data_fields(self, infilename):
+         data_fields_lists = {}
+
+         outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+
+         for outfilename in outfilenames:
+             data_fields_lists[outfilename] = []
+
+         for key, outfield_data in outdata.items():
+             # keys are "~"-delimited; the output file name is the last element
+             keydata = key.split("~")
+             outfile = keydata[-1]
+             for outfield_elem in outfield_data:
+                 for infield, outfields in outfield_elem.items():
+                     for outfield in outfields:
+                         outfielddata = outfield.split("~")
+                         if self.omopcdm.is_omop_data_field(outfile, outfielddata[0]):
+                             if infield not in data_fields_lists[outfile]:
+                                 data_fields_lists[outfile].append(infield)
+
+         return data_fields_lists
+
+     def get_infile_date_person_id(self, infilename):
+         outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+         datetime_source = ""
+         person_id_source = ""
+
+         for key, outfield_data in outdata.items():
+             keydata = key.split("~")
+             outfile = keydata[-1]
+             for outfield_elem in outfield_data:
+                 for infield, outfield_list in outfield_elem.items():
+                     for outfield in outfield_list:
+                         if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
+                             datetime_source = infield
+                         if outfield == self.omopcdm.get_omop_person_id_field(outfile):
+                             person_id_source = infield
+
+         return datetime_source, person_id_source
+
+     def get_person_source_field_info(self, tgtfilename):
+         """
+         Specific discovery of input data field names for 'person' in these rules
+         """
+         birth_datetime_source = None
+         person_id_source = None
+         if tgtfilename in self.rules_data["cdm"]:
+             source_rules_data = self.rules_data["cdm"][tgtfilename]
+             for rule_name, rule_fields in source_rules_data.items():
+                 if "birth_datetime" in rule_fields:
+                     birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
+                 if "person_id" in rule_fields:
+                     person_id_source = rule_fields["person_id"]["source_field"]
+
+         return birth_datetime_source, person_id_source
+
+     def parse_rules_src_to_tgt(self, infilename):
+         """
+         Parse rules to produce a map of source to target data for a given input file
+         """
+         if infilename in self.outfile_names and infilename in self.parsed_rules:
+             return self.outfile_names[infilename], self.parsed_rules[infilename]
+         outfilenames = []
+         outdata = {}
+
+         for outfilename, rules_set in self.rules_data["cdm"].items():
+             for datatype, rules in rules_set.items():
+                 key, data = self.process_rules(infilename, outfilename, rules)
+                 if key != "":
+                     if key not in outdata:
+                         outdata[key] = []
+                     outdata[key].append(data)
+                     if outfilename not in outfilenames:
+                         outfilenames.append(outfilename)
+
+         self.parsed_rules[infilename] = outdata
+         self.outfile_names[infilename] = outfilenames
+         return outfilenames, outdata
+
+     def process_rules(self, infilename, outfilename, rules):
+         """
+         Process rules for an infile, outfile combination
+         """
+         data = {}
+         plain_key = ""
+         term_value_key = ""
+
+         for outfield, source_info in rules.items():
+             if source_info["source_field"] not in data:
+                 data[source_info["source_field"]] = []
+             if source_info["source_table"] == infilename:
+                 if "term_mapping" in source_info:
+                     if isinstance(source_info["term_mapping"], dict):
+                         # one "~"-delimited key per mapped input value
+                         for inputvalue, term in source_info["term_mapping"].items():
+                             term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
+                             data[source_info["source_field"]].append(outfield + "~" + str(term))
+                     else:
+                         plain_key = infilename + "~" + source_info["source_field"] + "~" + outfilename
+                         data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"]))
+                 else:
+                     data[source_info["source_field"]].append(outfield)
+         if term_value_key != "":
+             return term_value_key, data
+
+         return plain_key, data
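A minimal usage sketch for MappingRules (file paths are hypothetical; the OmopCDM module path follows the relative import above, while the MappingRules module name is assumed):

    from carrottransform.tools.omopcdm import OmopCDM
    from carrottransform.tools.mappingrules import MappingRules  # assumed module name

    # hypothetical inputs: an OMOP ddl script, a config JSON, and a CaRROT rules JSON
    omopcdm = OmopCDM("OMOPCDM_postgresql_5.3_ddl.sql", "omop.json")
    rules = MappingRules("rules.json", omopcdm)

    print(rules.get_dataset_name())        # "Unknown" when metadata.dataset is absent
    print(rules.get_all_infile_names())    # distinct source_table values in the rules
    outfiles, outdata = rules.parse_rules_src_to_tgt("demographics.csv")

parse_rules_src_to_tgt() caches its result, so repeated calls for the same input file are cheap. For reference, a minimal rules-file shape consistent with the traversal in process_rules() (all values hypothetical):

    {
      "metadata": {"dataset": "ExampleDataset"},
      "cdm": {
        "person": {
          "MALE": {
            "gender_concept_id": {
              "source_table": "demographics.csv",
              "source_field": "sex",
              "term_mapping": {"M": 8507}
            }
          }
        }
      }
    }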
@@ -0,0 +1,127 @@
+ class Metrics:
+     def __init__(self, dataset_name, log_threshold=0):
+         self.datasummary = {}
+         self.allcounts = {}
+         self.log_data = ""
+         self.dataset_name = dataset_name
+         self.log_threshold = log_threshold
+
+     def get_new_mapstream_counts(self):
+         counts = {}
+         counts["input_count"] = 0
+         counts["invalid_persids"] = 0
+         counts["invalid_dates"] = 0
+         counts["invalid_source_fields"] = 0
+         counts["output_count"] = 0
+
+         return counts
+
+     def add_data(self, desttablename, increment):
+         """
+         add_data(self, destination table, data increment)
+         Apply the contents of a data increment to the stored self.datasummary
+         """
+         name = increment["name"]
+         for datakey, dataitem in increment.items():
+             if datakey == "valid_person_id":
+                 dkey = "NA" + "." + desttablename + "." + name + "." + datakey
+                 self.add_counts_to_summary(dkey, dataitem)
+             elif datakey == "person_id":
+                 dkey = "NA" + "." + desttablename + "." + name + "." + datakey
+                 self.add_counts_to_summary(dkey, dataitem)
+             elif datakey == "required_fields":
+                 for fieldname in dataitem:
+                     prfx = "NA"
+                     if "source_files" in increment:
+                         if fieldname in increment["source_files"]:
+                             prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
+                     dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                     self.add_counts_to_summary(dkey, dataitem[fieldname])
+
+     def get_prefix(self, fname):
+         return fname.split(".")[0]
+
+     def add_counts_to_summary(self, dkey, count_block):
+         if dkey not in self.datasummary:
+             self.datasummary[dkey] = {}
+         for counttype in count_block:
+             if counttype not in self.datasummary[dkey]:
+                 self.datasummary[dkey][counttype] = 0
+             self.datasummary[dkey][counttype] += int(count_block[counttype])
+
+     def increment_key_count(self, dkey, count_type):
+         """
+         Intended to work with the mapstream functions
+         """
+         if dkey not in self.datasummary:
+             self.datasummary[dkey] = {}
+         if count_type not in self.datasummary[dkey]:
+             self.datasummary[dkey][count_type] = 0
+         self.datasummary[dkey][count_type] += 1
+
+     def get_summary(self):
+         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
+
+         for dkey in self.datasummary:
+             source, tablename, name, colname = dkey.split(".")
+             before_count = int(self.datasummary[dkey]["before"])
+             after_count = int(self.datasummary[dkey]["after"])
+             after_pct = (before_count - after_count) * 100.0 / before_count
+             summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
+             if "after_formatting" in self.datasummary[dkey]:
+                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
+                 after_format_pct = (after_count - after_format_count) * 100.0 / after_count
+                 summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
+             else:
+                 summary_str += "NA\tNA\n"
+
+         return summary_str
+
+     def get_data_summary(self):
+         return self.datasummary
+
+     def get_mapstream_summary(self):
+         summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
+
+         for dkey in sorted(self.datasummary):
+             try:
+                 source, fieldname, tablename, concept_id, additional = dkey.split("~")
+             except ValueError:
+                 # malformed key: report and skip it rather than abandoning the summary
+                 print("get_mapstream_summary - ValueError: {0}".format(dkey))
+                 continue
+
+             source = self.get_prefix(source)
+             dvalue = self.datasummary[dkey]
+
+             input_count = str(dvalue.get("input_count", 0))
+             invalid_person_ids = str(dvalue.get("invalid_person_ids", 0))
+             invalid_source_fields = str(dvalue.get("invalid_source_fields", 0))
+             invalid_date_fields = str(dvalue.get("invalid_date_fields", 0))
+             output_count = str(dvalue.get("output_count", 0))
+
+             if int(output_count) >= self.log_threshold:
+                 summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional + "\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+
+         return summary_str
+
+     def add_log_data(self, msg):
+         self.log_data += msg + "\n"
+
+     def get_log_data(self):
+         return self.log_data
+
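A short sketch of how Metrics accumulates counts (the key below is hypothetical; get_summary() expects dot-delimited keys of the form source.tablename.name.column carrying "before" and "after" counts):

    metrics = Metrics("ExampleDataset", log_threshold=0)

    dkey = "demo.person.birth_datetime.dob"   # hypothetical key
    for _ in range(10):
        metrics.increment_key_count(dkey, "before")
    for _ in range(8):
        metrics.increment_key_count(dkey, "after")

    print(metrics.get_summary())   # reports 10 before, 8 after, 20.000 pct rejected

Keys built with "~" delimiters feed get_mapstream_summary() instead, which requires exactly five "~"-separated elements per key.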
@@ -0,0 +1,182 @@
+ import json
+ import re
+ import sys
+
+ import carrottransform.tools as tools
+
+
+ class OmopCDM:
+
+     def __init__(self, omopddl, omopcfg):
+         self.numeric_types = ["integer", "numeric"]
+         self.datetime_types = ["timestamp"]
+         self.date_types = ["date"]
+         self.omop_json = self.load_ddl(omopddl)
+         self.omop_json = self.merge_json(self.omop_json, omopcfg)
+         self.all_columns = self.get_columns("all_columns")
+         self.numeric_fields = self.get_columns("numeric_fields")
+         self.notnull_numeric_fields = self.get_columns("notnull_numeric_fields")
+         self.datetime_linked_fields = self.get_columns("datetime_linked_fields")
+         self.date_field_components = self.get_columns("date_field_components")
+         self.datetime_fields = self.get_columns("datetime_fields")
+         self.person_id_field = self.get_columns("person_id_field")
+         self.auto_number_field = self.get_columns("auto_number_field")
+
+     def load_ddl(self, omopddl):
+         try:
+             with open(omopddl, "r") as fp:
+                 return self.process_ddl(fp)
+         except IOError as e:
+             print("I/O error for ddl file ({0}): {1}".format(e.errno, e.strerror))
+             sys.exit()
+
+     def process_ddl(self, fp):
+         """
+         Process the OMOP ddl file to output the attributes which CaRROT-CDM understands.
+         Selected parts of the ddl definition are matched using regexes.
+         """
+         output_dict = {}
+         output_dict["all_columns"] = {}
+         output_dict["numeric_fields"] = {}
+         output_dict["notnull_numeric_fields"] = {}
+         output_dict["datetime_fields"] = {}
+         output_dict["date_fields"] = {}
+
+         ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+         start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
+         datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+         end_rgx = re.compile(r'.*[)];$')
+         vermatched = False
+         processing_table_data = False
+         tabname = ""
+
+         for line in fp:
+             line = line.strip()
+             # check for a line carrying the OMOP version, if present
+             if not vermatched:
+                 vmatch = ver_rgx.search(line)
+                 if vmatch is not None:
+                     output_dict["omop_version"] = vmatch.group(1)
+                     vermatched = True
+             # check for the start of a table definition
+             if not processing_table_data:
+                 smatch = start_rgx.search(line)
+                 if smatch is not None:
+                     processing_table_data = True
+                     tabname = smatch.group(2).lower()
+             else:
+                 idtmatch = datatype_rgx.search(line)
+                 if idtmatch is not None:
+                     fname = idtmatch.group(1)
+                     ftype = idtmatch.group(2)
+
+                     # make sure the per-table lists exist
+                     for colkey in ("all_columns", "numeric_fields", "notnull_numeric_fields", "datetime_fields", "date_fields"):
+                         if tabname not in output_dict[colkey]:
+                             output_dict[colkey][tabname] = []
+
+                     # add in required column / field data
+                     output_dict["all_columns"][tabname].append(fname)
+                     if ftype.lower() in self.numeric_types:
+                         output_dict["numeric_fields"][tabname].append(fname)
+                         if "NOT" in line and "NULL" in line:
+                             output_dict["notnull_numeric_fields"][tabname].append(fname)
+                     if ftype.lower() in self.datetime_types:
+                         output_dict["datetime_fields"][tabname].append(fname)
+                     if ftype.lower() in self.date_types:
+                         output_dict["date_fields"][tabname].append(fname)
+
+                 ematch = end_rgx.search(line)
+                 if ematch is not None:
+                     processing_table_data = False
+
+         return output_dict
+
+     def dump_ddl(self):
+         return json.dumps(self.omop_json, indent=2)
+
+     def merge_json(self, omopjson, omopcfg):
+         tmp_json = tools.load_json(omopcfg)
+         for key, data in tmp_json.items():
+             omopjson[key] = data
+         return omopjson
+
+     def get_columns(self, colkey):
+         if colkey in self.omop_json:
+             return self.omop_json[colkey]
+         return None
+
+     def get_column_map(self, colarr, delim=","):
+         colmap = {}
+         for i, col in enumerate(colarr):
+             colmap[col] = i
+         return colmap
+
+     def get_omop_column_map(self, tablename):
+         if tablename in self.all_columns:
+             return self.get_column_map(self.all_columns[tablename])
+         return None
+
+     def get_omop_column_list(self, tablename):
+         if tablename in self.all_columns:
+             return self.all_columns[tablename]
+         return None
+
+     def is_omop_data_field(self, tablename, fieldname):
+         if fieldname in self.get_omop_datetime_linked_fields(tablename):
+             return False
+         if fieldname in self.get_omop_datetime_fields(tablename):
+             return False
+         if fieldname == self.get_omop_person_id_field(tablename):
+             return False
+         return True
+
+     def get_omop_numeric_fields(self, tablename):
+         if self.numeric_fields is not None:
+             if tablename in self.numeric_fields:
+                 return self.numeric_fields[tablename]
+         return []
+
+     def get_omop_notnull_numeric_fields(self, tablename):
+         if self.notnull_numeric_fields is not None:
+             if tablename in self.notnull_numeric_fields:
+                 return self.notnull_numeric_fields[tablename]
+         return []
+
+     def get_omop_datetime_linked_fields(self, tablename):
+         if self.datetime_linked_fields is not None:
+             if tablename in self.datetime_linked_fields:
+                 return self.datetime_linked_fields[tablename]
+         return {}
+
+     def get_omop_date_field_components(self, tablename):
+         if self.date_field_components is not None:
+             if tablename in self.date_field_components:
+                 return self.date_field_components[tablename]
+         return {}
+
+     def get_omop_datetime_fields(self, tablename):
+         if self.datetime_fields is not None:
+             if tablename in self.datetime_fields:
+                 return self.datetime_fields[tablename]
+         return []
+
+     def get_omop_person_id_field(self, tablename):
+         if self.person_id_field is not None:
+             if tablename in self.person_id_field:
+                 return self.person_id_field[tablename]
+         return None
+
+     def get_omop_auto_number_field(self, tablename):
+         if self.auto_number_field is not None:
+             if tablename in self.auto_number_field:
+                 return self.auto_number_field[tablename]
+         return None
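A minimal sketch of the DDL-driven lookups (paths are hypothetical; the config JSON supplies keys such as person_id_field that merge_json() overlays on the parsed ddl):

    omopcdm = OmopCDM("OMOPCDM_postgresql_5.3_ddl.sql", "omop.json")

    print(omopcdm.get_omop_column_list("person"))          # all columns parsed from the ddl
    print(omopcdm.get_omop_datetime_fields("measurement")) # timestamp-typed columns, e.g. measurement_datetime
    print(omopcdm.get_omop_person_id_field("observation")) # comes from the merged config, not the ddl

Because get_columns() returns None for any key missing from the merged JSON, a config that omits, say, datetime_linked_fields simply disables the corresponding checks in is_omop_data_field().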