carrot-transform 0.1.0__py3-none-any.whl

@@ -0,0 +1,161 @@
+import os
+import json
+import carrottransform.tools as tools
+from .omopcdm import OmopCDM
+
+class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed JSON; as each input file is
+    processed, the rules are reorganised into a file-specific dictionary so they can be
+    looked up depending on data content
+    """
+
+    def __init__(self, rulesfilepath, omopcdm):
+        self.rules_data = tools.load_json(rulesfilepath)
+        self.omopcdm = omopcdm
+
+        self.parsed_rules = {}
+        self.outfile_names = {}
+
+        self.dataset_name = self.get_dsname_from_rules()
+
+    def dump_parsed_rules(self):
+        return json.dumps(self.parsed_rules, indent=2)
+
+    def get_dsname_from_rules(self):
+        dsname = "Unknown"
+
+        if "metadata" in self.rules_data:
+            if "dataset" in self.rules_data["metadata"]:
+                dsname = self.rules_data["metadata"]["dataset"]
+
+        return dsname
+
+    def get_dataset_name(self):
+        return self.dataset_name
+
+    def get_all_outfile_names(self):
+        return list(self.rules_data["cdm"])
+
+    def get_all_infile_names(self):
+        file_list = []
+
+        for outfilename, conditions in self.rules_data["cdm"].items():
+            for outfield, source_field in conditions.items():
+                for source_field_name, source_data in source_field.items():
+                    if "source_table" in source_data:
+                        if source_data["source_table"] not in file_list:
+                            file_list.append(source_data["source_table"])
+
+        return file_list
+
+    def get_infile_data_fields(self, infilename):
+        data_fields_lists = {}
+
+        outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+
+        for outfilename in outfilenames:
+            data_fields_lists[outfilename] = []
+
+        for key, outfield_data in outdata.items():
+            keydata = key.split("~")
+            outfile = keydata[-1]
+            for outfield_elem in outfield_data:
+                for infield, outfields in outfield_elem.items():
+                    for outfield in outfields:
+                        outfielddata = outfield.split("~")
+                        if self.omopcdm.is_omop_data_field(outfile, outfielddata[0]):
+                            if infield not in data_fields_lists[outfile]:
+                                data_fields_lists[outfile].append(infield)
+
+        return data_fields_lists
+
+    def get_infile_date_person_id(self, infilename):
+        outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
+        datetime_source = ""
+        person_id_source = ""
+
+        for key, outfield_data in outdata.items():
+            keydata = key.split("~")
+            outfile = keydata[-1]
+            for outfield_elem in outfield_data:
+                for infield, outfield_list in outfield_elem.items():
+                    for outfield in outfield_list:
+                        if outfield in self.omopcdm.get_omop_datetime_fields(outfile):
+                            datetime_source = infield
+                        if outfield == self.omopcdm.get_omop_person_id_field(outfile):
+                            person_id_source = infield
+
+        return datetime_source, person_id_source
+
+    def get_person_source_field_info(self, tgtfilename):
+        """
+        Specific discovery of input data field names for 'person' in these rules
+        """
+        birth_datetime_source = None
+        person_id_source = None
+        if tgtfilename in self.rules_data["cdm"]:
+            source_rules_data = self.rules_data["cdm"][tgtfilename]
+            for rule_name, rule_fields in source_rules_data.items():
+                if "birth_datetime" in rule_fields:
+                    birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
+                if "person_id" in rule_fields:
+                    person_id_source = rule_fields["person_id"]["source_field"]
+
+        return birth_datetime_source, person_id_source
+
+    def parse_rules_src_to_tgt(self, infilename):
+        """
+        Parse rules to produce a map of source to target data for a given input file
+        """
+        if infilename in self.outfile_names and infilename in self.parsed_rules:
+            return self.outfile_names[infilename], self.parsed_rules[infilename]
+        outfilenames = []
+        outdata = {}
+
+        for outfilename, rules_set in self.rules_data["cdm"].items():
+            for datatype, rules in rules_set.items():
+                key, data = self.process_rules(infilename, outfilename, rules)
+                if key != "":
+                    if key not in outdata:
+                        outdata[key] = []
+                    outdata[key].append(data)
+                    if outfilename not in outfilenames:
+                        outfilenames.append(outfilename)
+
+        self.parsed_rules[infilename] = outdata
+        self.outfile_names[infilename] = outfilenames
+        return outfilenames, outdata
+
+    def process_rules(self, infilename, outfilename, rules):
+        """
+        Process rules for an infile, outfile combination
+        """
+        data = {}
+        plain_key = ""
+        term_value_key = ""
+
+        for outfield, source_info in rules.items():
+            if source_info["source_table"] == infilename:
+                # only collect fields sourced from this input file
+                if source_info["source_field"] not in data:
+                    data[source_info["source_field"]] = []
+                if "term_mapping" in source_info:
+                    if isinstance(source_info["term_mapping"], dict):
+                        for inputvalue, term in source_info["term_mapping"].items():
+                            term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
+                            data[source_info["source_field"]].append(outfield + "~" + str(term))
+                    else:
+                        plain_key = infilename + "~" + source_info["source_field"] + "~" + outfilename
+                        data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"]))
+                else:
+                    data[source_info["source_field"]].append(outfield)
+        if term_value_key != "":
+            return term_value_key, data
+
+        return plain_key, data
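
To make the rule parsing concrete, here is a minimal sketch of what process_rules() produces, using an invented rule fragment (the file, field and concept names are hypothetical, but the shape matches the JSON this parser walks):

    # hypothetical rule fragment for the "person" output table
    rules = {
        "gender_concept_id": {
            "source_table": "demo.csv",
            "source_field": "sex",
            "term_mapping": {"M": 8507},
        },
        "person_id": {"source_table": "demo.csv", "source_field": "id"},
    }
    # process_rules("demo.csv", "person", rules) would return:
    #   key  -> "demo.csv~sex~M~person"
    #   data -> {"sex": ["gender_concept_id~8507"], "id": ["person_id"]}
    # parse_rules_src_to_tgt("demo.csv") then groups these data dicts under each key.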
@@ -0,0 +1,129 @@
+class Metrics:
+    """
+    Capture metrics for output to a summary tsv file; record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
+    def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
+        self.log_threshold = log_threshold
+
+    def get_new_mapstream_counts(self):
+        """
+        return a new, initialised, count structure
+        """
+        # NOTE: these key names must match the ones read back in get_mapstream_summary()
+        counts = {}
+        counts["input_count"] = 0
+        counts["invalid_person_ids"] = 0
+        counts["invalid_date_fields"] = 0
+        counts["invalid_source_fields"] = 0
+        counts["output_count"] = 0
+
+        return counts
+
+    def add_data(self, desttablename, increment):
+        """
+        add_data(self, destination table, data increment)
+        Apply the contents of a data increment to the stored self.datasummary
+        """
+        name = increment["name"]
+        for datakey, dataitem in increment.items():
+            if datakey in ("valid_person_id", "person_id"):
+                dkey = "NA" + "." + desttablename + "." + name + "." + datakey
+                self.add_counts_to_summary(dkey, dataitem)
+            elif datakey == "required_fields":
+                for fieldname in dataitem:
+                    prfx = "NA"
+                    if "source_files" in increment:
+                        if fieldname in increment["source_files"]:
+                            prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
+                    dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                    self.add_counts_to_summary(dkey, dataitem[fieldname])
+
+    def get_prefix(self, fname):
+        return fname.split(".")[0]
+
+    def add_counts_to_summary(self, dkey, count_block):
+        if dkey not in self.datasummary:
+            self.datasummary[dkey] = {}
+        for counttype in count_block:
+            if counttype not in self.datasummary[dkey]:
+                self.datasummary[dkey][counttype] = 0
+            self.datasummary[dkey][counttype] += int(count_block[counttype])
+
+    def increment_key_count(self, dkey, count_type):
+        """
+        Intended to work with the mapstream functions
+        """
+        if dkey not in self.datasummary:
+            self.datasummary[dkey] = {}
+        if count_type not in self.datasummary[dkey]:
+            self.datasummary[dkey][count_type] = 0
+        self.datasummary[dkey][count_type] += 1
+ self.datasummary[dkey][count_type] += 1
70
+
71
+ def get_summary(self):
72
+ summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
73
+
74
+ for dkey in self.datasummary:
75
+ #print(dkey)
76
+ source, tablename, name, colname = dkey.split('.')
77
+ before_count = int(self.datasummary[dkey]["before"])
78
+ after_count = int(self.datasummary[dkey]["after"])
79
+ after_pct = (float)(before_count - after_count) * 100 / before_count
80
+ summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
81
+ if "after_formatting" in self.datasummary[dkey]:
82
+ after_format_count = int(self.datasummary[dkey]["after_formatting"])
83
+ after_format_pct = (float)(after_count - after_format_count) * 100 / after_count
84
+ summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
85
+ else:
86
+ summary_str += "NA\tNA\n"
87
+
88
+ return summary_str
89
+
90
+ def get_data_summary(self):
91
+ return self.datasummary
+
+    def get_mapstream_summary(self):
+        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
+
+        for dkey in sorted(self.datasummary):
+            try:
+                source, fieldname, tablename, concept_id, additional = dkey.split('~')
+            except ValueError:
+                print("get_mapstream_summary - ValueError: {0}".format(dkey))
+                break
+
+            source = self.get_prefix(source)
+            dvalue = self.datasummary[dkey]
+
+            input_count = str(dvalue.get("input_count", 0))
+            invalid_person_ids = str(dvalue.get("invalid_person_ids", 0))
+            invalid_source_fields = str(dvalue.get("invalid_source_fields", 0))
+            invalid_date_fields = str(dvalue.get("invalid_date_fields", 0))
+            output_count = str(dvalue.get("output_count", 0))
+
+            if int(output_count) >= self.log_threshold:
+                summary_str += "\t".join([self.dataset_name, source, fieldname, tablename,
+                                          concept_id, additional, input_count, invalid_person_ids,
+                                          invalid_date_fields, invalid_source_fields, output_count]) + "\n"
+
+        return summary_str
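
A minimal usage sketch for the mapstream counts (the dataset name and dkey are invented; the dkey must be the five-part "~"-delimited form that get_mapstream_summary() unpacks):

    m = Metrics("my_dataset")
    dkey = "demo.csv~sex~person~8507~"   # source~source_field~target~concept_id~additional
    m.increment_key_count(dkey, "input_count")
    m.increment_key_count(dkey, "output_count")
    print(m.get_mapstream_summary())
    # header line, then one tab-separated row (empty "additional" column):
    # my_dataset  demo  sex  person  8507    1  0  0  0  1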
@@ -0,0 +1,187 @@
+import carrottransform.tools as tools
+import json
+import re
+import sys
+
+class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
+
+    def __init__(self, omopddl, omopcfg):
+        self.numeric_types = ["integer", "numeric"]
+        self.datetime_types = ["timestamp"]
+        self.date_types = ["date"]
+        self.omop_json = self.load_ddl(omopddl)
+        self.omop_json = self.merge_json(self.omop_json, omopcfg)
+        self.all_columns = self.get_columns("all_columns")
+        self.numeric_fields = self.get_columns("numeric_fields")
+        self.notnull_numeric_fields = self.get_columns("notnull_numeric_fields")
+        self.datetime_linked_fields = self.get_columns("datetime_linked_fields")
+        self.date_field_components = self.get_columns("date_field_components")
+        self.datetime_fields = self.get_columns("datetime_fields")
+        self.person_id_field = self.get_columns("person_id_field")
+        self.auto_number_field = self.get_columns("auto_number_field")
+
+
+    def load_ddl(self, omopddl):
+        try:
+            with open(omopddl, "r") as fp:
+                return self.process_ddl(fp)
+        except OSError:
+            print("OMOP ddl file ({0}) not found".format(omopddl))
+            sys.exit(1)
+
+    def process_ddl(self, fp):
+        """
+        Process the omop ddl file to output the attributes which CaRROT-CDM understands
+        Matching of selected parts of the ddl definition is performed using regexes
+        """
+        output_dict = {}
+        output_dict["all_columns"] = {}
+        output_dict["numeric_fields"] = {}
+        output_dict["notnull_numeric_fields"] = {}
+        output_dict["datetime_fields"] = {}
+        output_dict["date_fields"] = {}
+
+        ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
+        datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        end_rgx = re.compile(r'.*[)];$')
+        vermatched = False
+        processing_table_data = False
+        tabname = ""
+
+        for line in fp:
+            line = line.strip()
+            # check for line with version, if present
+            if not vermatched:
+                vmatch = ver_rgx.search(line)
+                if vmatch is not None:
+                    version_string = vmatch.group(1)
+                    output_dict["omop_version"] = version_string
+                    vermatched = True
+            # check for start of table definition
+            if not processing_table_data:
+                smatch = start_rgx.search(line)
+                if smatch is not None:
+                    processing_table_data = True
+                    tabname = smatch.group(2).lower()
+            else:
+                idtmatch = datatype_rgx.search(line)
+                if idtmatch is not None:
+                    fname = idtmatch.group(1)
+                    ftype = idtmatch.group(2)
+
+                    # Check for dictionary element presence
+                    if tabname not in output_dict["all_columns"]:
+                        output_dict["all_columns"][tabname] = []
+                    if tabname not in output_dict["numeric_fields"]:
+                        output_dict["numeric_fields"][tabname] = []
+                    if tabname not in output_dict["notnull_numeric_fields"]:
+                        output_dict["notnull_numeric_fields"][tabname] = []
+                    if tabname not in output_dict["datetime_fields"]:
+                        output_dict["datetime_fields"][tabname] = []
+                    if tabname not in output_dict["date_fields"]:
+                        output_dict["date_fields"][tabname] = []
+
+                    # Add in required column / field data
+                    output_dict["all_columns"][tabname].append(fname)
+                    if ftype.lower() in self.numeric_types:
+                        output_dict["numeric_fields"][tabname].append(fname)
+                    if ftype.lower() in self.numeric_types and "NOT" in line and "NULL" in line:
+                        output_dict["notnull_numeric_fields"][tabname].append(fname)
+                    if ftype.lower() in self.datetime_types:
+                        output_dict["datetime_fields"][tabname].append(fname)
+                    if ftype.lower() in self.date_types:
+                        output_dict["date_fields"][tabname].append(fname)
+
+                ematch = end_rgx.search(line)
+                if ematch is not None:
+                    processing_table_data = False
+
+        return output_dict
+
+    def dump_ddl(self):
+        return json.dumps(self.omop_json, indent=2)
+
+    def merge_json(self, omopjson, omopcfg):
+        tmp_json = tools.load_json(omopcfg)
+        for key, data in tmp_json.items():
+            omopjson[key] = data
+        return omopjson
+
+    def get_columns(self, colkey):
+        if colkey in self.omop_json:
+            return self.omop_json[colkey]
+        return None
+
+    def get_column_map(self, colarr, delim=","):
+        colmap = {}
+        for i, col in enumerate(colarr):
+            colmap[col] = i
+        return colmap
+
+    def get_omop_column_map(self, tablename):
+        if tablename in self.all_columns:
+            return self.get_column_map(self.all_columns[tablename])
+        return None
+
+    def get_omop_column_list(self, tablename):
+        if tablename in self.all_columns:
+            return self.all_columns[tablename]
+        return None
+
+    def is_omop_data_field(self, tablename, fieldname):
+        if fieldname in self.get_omop_datetime_linked_fields(tablename):
+            return False
+        if fieldname in self.get_omop_datetime_fields(tablename):
+            return False
+        # the person-id lookup returns a single field name (or None), so test
+        # equality rather than "in" substring membership
+        if fieldname == self.get_omop_person_id_field(tablename):
+            return False
+        return True
+
+    def get_omop_numeric_fields(self, tablename):
+        if self.numeric_fields is not None:
+            if tablename in self.numeric_fields:
+                return self.numeric_fields[tablename]
+        return []
+
+    def get_omop_notnull_numeric_fields(self, tablename):
+        if self.notnull_numeric_fields is not None:
+            if tablename in self.notnull_numeric_fields:
+                return self.notnull_numeric_fields[tablename]
+        return []
+
+    def get_omop_datetime_linked_fields(self, tablename):
+        if self.datetime_linked_fields is not None:
+            if tablename in self.datetime_linked_fields:
+                return self.datetime_linked_fields[tablename]
+        return {}
+
+    def get_omop_date_field_components(self, tablename):
+        if self.date_field_components is not None:
+            if tablename in self.date_field_components:
+                return self.date_field_components[tablename]
+        return {}
+
+    def get_omop_datetime_fields(self, tablename):
+        if self.datetime_fields is not None:
+            if tablename in self.datetime_fields:
+                return self.datetime_fields[tablename]
+        return []
+
+    def get_omop_person_id_field(self, tablename):
+        if self.person_id_field is not None:
+            if tablename in self.person_id_field:
+                return self.person_id_field[tablename]
+        return None
+
+    def get_omop_auto_number_field(self, tablename):
+        if self.auto_number_field is not None:
+            if tablename in self.auto_number_field:
+                return self.auto_number_field[tablename]
+        return None
+ return None