carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of carrot-transform might be problematic.

Files changed (33):
  1. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +302 -443
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/click.py +21 -0
  16. carrottransform/tools/concept_helpers.py +61 -0
  17. carrottransform/tools/core.py +163 -0
  18. carrottransform/tools/date_helpers.py +79 -0
  19. carrottransform/tools/file_helpers.py +177 -7
  20. carrottransform/tools/logger.py +19 -0
  21. carrottransform/tools/mapping_types.py +32 -0
  22. carrottransform/tools/mappingrules.py +298 -32
  23. carrottransform/tools/metrics.py +274 -49
  24. carrottransform/tools/omopcdm.py +42 -32
  25. carrottransform/tools/orchestrator.py +381 -0
  26. carrottransform/tools/person_helpers.py +126 -0
  27. carrottransform/tools/record_builder.py +413 -0
  28. carrottransform/tools/stream_helpers.py +71 -0
  29. carrottransform/tools/types.py +71 -0
  30. carrottransform/tools/validation.py +62 -0
  31. carrot_transform-0.3.4.dist-info/RECORD +0 -24
  32. carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
  33. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/metrics.py (+274 -49)

@@ -1,15 +1,125 @@
-class Metrics():
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+from carrottransform.tools.logger import logger_setup
+
+logger = logger_setup()
+
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is here in case that representation is needed somewhere
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts
+        """
+        return hash(
+            (
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+            )
+        )
+
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int = 0):
+        return self.counts.get(count_type, default)
+
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [
+            str(col)
+            for col in [
+                self.dataset_name,
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+                self.input_count,
+                self.invalid_person_ids,
+                self.invalid_date_fields,
+                self.invalid_source_fields,
+                self.output_count,
+            ]
+        ]
+        # If python gets updated, you can move the row_str expression into the f-string
+        row_str = "\t".join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount",
+        ]
+        header_str = "\t".join(header)
+        return f"{header_str}\n"
+
+
+class Metrics:
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
     The main principle is to increment counts associated with datakeys (dkey) at different levels
     """
+
     def __init__(self, dataset_name, log_threshold=0):
         """
         self.datasummary holds all the saved counts
         """
-        self.datasummary={}
-        self.allcounts={}
-        self.dataset_name=dataset_name
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
         self.log_threshold = log_threshold

     def get_new_mapstream_counts(self):
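
An aside, not part of the diff: the new DataKey and CountData classes above can be exercised on their own. A minimal sketch, assuming the 0.4.0 wheel is installed so carrottransform.tools.metrics is importable; the key values here are illustrative:

from carrottransform.tools.metrics import CountData, DataKey

# Positional arguments follow the dataclass field order above.
key = DataKey("demographics.csv", "sex", "person", "8507", "")

# str() reproduces the old "~"-joined key representation.
print(str(key))  # demographics.csv~sex~person~8507~

# The explicit __hash__ keeps DataKey usable as a dictionary key.
counts = {key: CountData()}
counts[key].increment("input_count")
print(counts[key].get_count("input_count"))  # 1
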
@@ -43,8 +153,18 @@ class Metrics():
             prfx = "NA"
             if "source_files" in increment:
                 if fieldname in increment["source_files"]:
-                    prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
-            dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                    prfx = self.get_prefix(
+                        increment["source_files"][fieldname]["table"]
+                    )
+            dkey = (
+                prfx
+                + "."
+                + desttablename
+                + "."
+                + name
+                + "."
+                + fieldname
+            )
             self.add_counts_to_summary(dkey, dataitem[fieldname])

     def get_prefix(self, fname):
@@ -58,30 +178,122 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])

-    def increment_key_count(self, dkey, count_type):
-        """
-        Intended to work with the mapstream functions
-        """
+    def increment_key_count(
+        self, source, fieldname, tablename, concept_id, additional, count_type
+    ):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] = {}
-        if count_type not in self.datasummary[dkey]:
-            self.datasummary[dkey][count_type] = 0
-        self.datasummary[dkey][count_type] += 1
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        # But I don't know if that's the desired behaviour
+
+        # A lot of these increment the same thing, so I have defined `increment_this`
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count",
+            )
+
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(
+                fieldname="all", concept_id=out_record[1], additional=out_record[2]
+            )
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )

     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"

         for dkey in self.datasummary:
-            #print(dkey)
-            source, tablename, name, colname = dkey.split('.')
+            logger.debug(dkey)
+            source, tablename, name, colname = dkey.split(".")
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
             after_pct = (float)(before_count - after_count) * 100 / before_count
-            summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
+            summary_str += (
+                source
+                + "\t"
+                + tablename
+                + "\t"
+                + name
+                + "\t"
+                + colname
+                + "\t"
+                + str(before_count)
+                + "\t"
+                + str(after_count)
+                + "\t"
+                + "{0:.3f}".format(after_pct)
+                + "\t"
+            )
             if "after_formatting" in self.datasummary[dkey]:
                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
-                after_format_pct = (float)(after_count - after_format_count) * 100 / after_count
-                summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
+                after_format_pct = (
+                    (float)(after_count - after_format_count) * 100 / after_count
+                )
+                summary_str += (
+                    str(after_format_count)
+                    + "\t"
+                    + "{0:.3f}".format(after_format_pct)
+                    + "\n"
+                )
             else:
                 summary_str += "NA\tNA\n"

@@ -90,40 +302,53 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary

-    def get_mapstream_summary(self):
-        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
-
-        for dkey in sorted(self.datasummary):
-            try:
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []

-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]

-            invalid_person_ids = "0"
-            if "invalid_person_ids" in dvalue:
-                invalid_person_ids = str(dvalue["invalid_person_ids"])
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count"),
+            )

-            invalid_source_fields = "0"
-            if "invalid_source_fields" in dvalue:
-                invalid_source_fields = str(dvalue["invalid_source_fields"])
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows

-            invalid_date_fields = "0"
-            if "invalid_date_fields" in dvalue:
-                invalid_date_fields = str(dvalue["invalid_date_fields"])
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()

-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
+        for row in summary_rows:
+            result += row.to_tsv_row()

-            if (int(output_count) >= self.log_threshold):
-                summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+        return result

-        return summary_str
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows],
+        }
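
Taken together, the metrics changes swap the old "~"-joined string keys for typed DataKey objects and structured summary rows. A minimal usage sketch, again assuming the installed 0.4.0 package; the dataset name and key parts are illustrative:

from carrottransform.tools.metrics import Metrics

metrics = Metrics(dataset_name="example_dataset", log_threshold=0)

# increment_key_count now takes the key parts directly rather than a
# pre-joined "~" string.
metrics.increment_key_count(
    source="demographics.csv",
    fieldname="sex",
    tablename="person",
    concept_id="8507",
    additional="",
    count_type="output_count",
)

# TSV summary: MapstreamSummaryRow.get_header() plus one row per DataKey
# whose output_count meets log_threshold.
print(metrics.get_mapstream_summary())

# New in 0.4.0: the same summary as a plain dict.
print(metrics.get_mapstream_summary_dict())
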
carrottransform/tools/omopcdm.py (+42 -32)

@@ -1,13 +1,19 @@
 import carrottransform.tools as tools
 import json
+from carrottransform.tools.logger import logger_setup
 import re
 import sys

+from pathlib import Path
+
+logger = logger_setup()
+
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
     Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
-    Define a series of "get" functions to allow CDM component discovery 
+    Define a series of "get" functions to allow CDM component discovery
     """

     def __init__(self, omopddl, omopcfg):
@@ -28,15 +34,14 @@ class OmopCDM:
         self.person_id_field = self.get_columns("person_id_field")
         self.auto_number_field = self.get_columns("auto_number_field")

-
-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(omopddl, "r")
-        except Exception as err:
-            print("OMOP ddl file ({0}) not found".format(omopddl))
+            fp = omopddl.open("r")
+        except Exception:
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
-
-        return(self.process_ddl(fp))
+
+        return self.process_ddl(fp)

     def process_ddl(self, fp):
         """
@@ -51,13 +56,13 @@ class OmopCDM:
         output_dict["date_fields"] = {}

         ## matching for version number - matches '--postgres', any number of chars and some digits of the form X.Y, plus an end of string or end of line
-        ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+        ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
         ## matching for table name - matches 'CREATE TABLE @', some letters (upper and lower case), '.' and some more letters (lower case)
-        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
+        start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
         ## matches some whitespace, lower case letters(or underscores), whitespace, letters (upper/lower and underscores)
-        datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
         ## matching for end of file - matches close bracket, semi colon, end of file or line
-        end_rgx = re.compile(r'.*[)];$')
+        end_rgx = re.compile(r".*[)];$")
         vermatched = False
         processing_table_data = False
         tabname = ""
@@ -65,21 +70,22 @@ class OmopCDM:
         for line in fp:
             line = line.strip()
             # check for line with version, if present
-            if vermatched == False:
+            if not vermatched:
                 vmatch = ver_rgx.search(line)
-                if vmatch != None:
+                if vmatch is not None:
                     version_string = vmatch.group(1)
                     output_dict["omop_version"] = version_string
                     vermatched = True
+
             # check for start of table definition
-            if processing_table_data == False:
+            if not processing_table_data:
                 smatch = start_rgx.search(line)
-                if smatch != None:
+                if smatch is not None:
                     processing_table_data = True
                     tabname = smatch.group(2).lower()
             else:
                 idtmatch = datatype_rgx.search(line)
-                if idtmatch != None:
+                if idtmatch is not None:
                     fname = idtmatch.group(1)
                     ftype = idtmatch.group(2)

@@ -94,12 +100,16 @@ class OmopCDM:
                         output_dict["datetime_fields"][tabname] = []
                     if tabname not in output_dict["date_fields"]:
                         output_dict["date_fields"][tabname] = []
-
+
                     # Add in required column / field data
                     output_dict["all_columns"][tabname].append(fname)
                     if ftype.lower() in self.numeric_types:
                         output_dict["numeric_fields"][tabname].append(fname)
-                    if ftype.lower() in self.numeric_types and "NOT" in line and "NULL" in line:
+                    if (
+                        ftype.lower() in self.numeric_types
+                        and "NOT" in line
+                        and "NULL" in line
+                    ):
                         output_dict["notnull_numeric_fields"][tabname].append(fname)
                     if ftype.lower() in self.datetime_types:
                         output_dict["datetime_fields"][tabname].append(fname)
@@ -107,19 +117,19 @@ class OmopCDM:
                         output_dict["date_fields"][tabname].append(fname)

             ematch = end_rgx.search(line)
-            if ematch != None:
+            if ematch is not None:
                 processing_table_data = False
-
-        return(output_dict)
-
+
+        return output_dict
+
     def dump_ddl(self):
-        return(json.dumps(self.omop_json, indent=2))
+        return json.dumps(self.omop_json, indent=2)

     def merge_json(self, omopjson, omopcfg):
         tmp_json = tools.load_json(omopcfg)
         for key, data in tmp_json.items():
             omopjson[key] = data
-        return(omopjson)
+        return omopjson

     def get_columns(self, colkey):
         if colkey in self.omop_json:
@@ -152,43 +162,43 @@ class OmopCDM:
             return True

     def get_omop_numeric_fields(self, tablename):
-        if self.numeric_fields != None:
+        if self.numeric_fields is not None:
             if tablename in self.numeric_fields:
                 return self.numeric_fields[tablename]
         return []

     def get_omop_notnull_numeric_fields(self, tablename):
-        if self.notnull_numeric_fields != None:
+        if self.notnull_numeric_fields is not None:
             if tablename in self.notnull_numeric_fields:
                 return self.notnull_numeric_fields[tablename]
         return []

     def get_omop_datetime_linked_fields(self, tablename):
-        if self.datetime_linked_fields != None:
+        if self.datetime_linked_fields is not None:
             if tablename in self.datetime_linked_fields:
                 return self.datetime_linked_fields[tablename]
         return {}

     def get_omop_date_field_components(self, tablename):
-        if self.date_field_components != None:
+        if self.date_field_components is not None:
             if tablename in self.date_field_components:
                 return self.date_field_components[tablename]
         return {}

     def get_omop_datetime_fields(self, tablename):
-        if self.datetime_fields != None:
+        if self.datetime_fields is not None:
             if tablename in self.datetime_fields:
                 return self.datetime_fields[tablename]
         return []

     def get_omop_person_id_field(self, tablename):
-        if self.person_id_field != None:
+        if self.person_id_field is not None:
             if tablename in self.person_id_field:
                 return self.person_id_field[tablename]
         return None

     def get_omop_auto_number_field(self, tablename):
-        if self.auto_number_field != None:
+        if self.auto_number_field is not None:
             if tablename in self.auto_number_field:
                 return self.auto_number_field[tablename]
         return None
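
The regular expressions in process_ddl() are unchanged apart from quote style, so their behaviour can be checked in isolation. A small standalone sketch; the three-line DDL snippet is illustrative, not taken from the bundled OMOPCDM_postgresql_5.4_ddl.sql:

import re

# The same patterns process_ddl() compiles, copied from the hunk above.
start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
end_rgx = re.compile(r".*[)];$")

ddl = [
    "CREATE TABLE @cdmDatabaseSchema.person (",
    "    person_id integer NOT NULL,",
    "    birth_datetime TIMESTAMP NULL );",
]

for line in ddl:
    line = line.strip()
    if (smatch := start_rgx.search(line)) is not None:
        print("table:", smatch.group(2).lower())  # table: person
    elif (dmatch := datatype_rgx.search(line)) is not None:
        print("column:", dmatch.group(1), dmatch.group(2))
    if end_rgx.search(line) is not None:
        print("end of table definition")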