carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



carrottransform/tools/metrics.py

@@ -1,3 +1,95 @@
+
+import logging
+logger = logging.getLogger(__name__)
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is kept in case that representation is still needed somewhere.
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts.
+        """
+        return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int = 0):
+        return self.counts.get(count_type, default)
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [str(col) for col in [
+            self.dataset_name,
+            self.source,
+            self.fieldname,
+            self.tablename,
+            self.concept_id,
+            self.additional,
+            self.input_count,
+            self.invalid_person_ids,
+            self.invalid_date_fields,
+            self.invalid_source_fields,
+            self.output_count
+        ]]
+        # Before Python 3.12, f-string expressions cannot contain backslashes,
+        # so the '\t'.join stays outside the f-string.
+        row_str = '\t'.join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount"
+        ]
+        header_str = '\t'.join(header)
+        return f"{header_str}\n"
+
 class Metrics():
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
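Taken together, the new dataclasses replace the old `~`-joined string keys. A minimal sketch of how they compose - the import path is assumed from the wheel's RECORD, and the explicit `__hash__` plus the dataclass-generated `__eq__` are what let `DataKey` work as a dict key:

```python
# Sketch, not from the diff; module path assumed from the wheel's RECORD.
from carrottransform.tools.metrics import DataKey, CountData, MapstreamSummaryRow

counts = {}  # what Metrics.datasummary now holds: Dict[DataKey, CountData]

key = DataKey("demo.csv", "all", "person", "all", "")
counts.setdefault(key, CountData()).increment("output_count")
counts.setdefault(key, CountData()).increment("output_count")

# An equal key hashes to the same entry, so both increments landed together.
assert counts[DataKey("demo.csv", "all", "person", "all", "")].get_count("output_count") == 2
assert counts[key].get_count("input_count") == 0  # default for absent count types

row = MapstreamSummaryRow(
    dataset_name="demo", source="demo.csv", fieldname="all",
    tablename="person", concept_id="all", additional="",
    output_count=counts[key].get_count("output_count"),
)
print(MapstreamSummaryRow.get_header() + row.to_tsv_row(), end="")
```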
@@ -58,21 +150,87 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])
 
-    def increment_key_count(self, dkey, count_type):
-        """
-        Intended to work with the mapstream functions
-        """
+    def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] = {}
-        if count_type not in self.datasummary[dkey]:
-            self.datasummary[dkey][count_type] = 0
-        self.datasummary[dkey][count_type] += 1
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc., and each has a method to sum its children, would be better.
+        # But I don't know if that's the desired behaviour.
+
+        # A lot of these calls increment the same thing, so `increment_this` is defined here.
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count"
+            )
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+
 
     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
 
         for dkey in self.datasummary:
-            #print(dkey)
+            logger.debug(dkey)
             source, tablename, name, colname = dkey.split('.')
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
@@ -90,40 +248,54 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary
 
-    def get_mapstream_summary(self):
-        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
-
-        for dkey in sorted(self.datasummary):
-            try:
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
-
-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []
 
-            invalid_person_ids = "0"
-            if "invalid_person_ids" in dvalue:
-                invalid_person_ids = str(dvalue["invalid_person_ids"])
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]
 
-            invalid_source_fields = "0"
-            if "invalid_source_fields" in dvalue:
-                invalid_source_fields = str(dvalue["invalid_source_fields"])
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count")
+            )
 
-            invalid_date_fields = "0"
-            if "invalid_date_fields" in dvalue:
-                invalid_date_fields = str(dvalue["invalid_date_fields"])
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows
 
-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
 
-            if (int(output_count) >= self.log_threshold):
-                summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()
 
-        return summary_str
+        for row in summary_rows:
+            result += row.to_tsv_row()
+
+        return result
 
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows]
+        }
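The summary is now available as typed rows, a TSV string, or a dict. A hedged usage sketch - the `Metrics` constructor arguments are an assumption inferred from the `self.dataset_name` and `self.log_threshold` references, since the constructor itself is not part of this diff:

```python
from carrottransform.tools.metrics import Metrics

# Assumed signature: Metrics(dataset_name, log_threshold); not shown in the diff.
metrics = Metrics("demo-dataset", 0)

# increment_key_count now takes the five key parts and builds the DataKey itself.
metrics.increment_key_count(
    source="demo.csv", fieldname="birth_date", tablename="person",
    concept_id="all", additional="", count_type="output_count",
)

print(metrics.get_mapstream_summary())         # TSV, header row included
summary = metrics.get_mapstream_summary_dict()  # {"dataset": ..., "threshold": ..., "rows": [...]}
```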
carrottransform/tools/omopcdm.py

@@ -1,8 +1,13 @@
 import carrottransform.tools as tools
 import json
+import logging
 import re
 import sys
 
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
@@ -14,7 +19,10 @@ class OmopCDM:
         self.numeric_types = ["integer", "numeric"]
         self.datetime_types = ["timestamp"]
         self.date_types = ["date"]
+        ## The ddl sets the headers that go in each table and whether each column is nullable; it also allows for more tables than we will use.
+        ## It also adds extra useful keys, such as 'all_columns' - before the merge.
         self.omop_json = self.load_ddl(omopddl)
+        ## Adds fields as a dict of dicts - presumably so they can be picked up by the get_columns() calls below.
        self.omop_json = self.merge_json(self.omop_json, omopcfg)
         self.all_columns = self.get_columns("all_columns")
         self.numeric_fields = self.get_columns("numeric_fields")
@@ -26,11 +34,11 @@ class OmopCDM:
         self.auto_number_field = self.get_columns("auto_number_field")
 
 
-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(omopddl, "r")
+            fp = omopddl.open("r")
         except Exception as err:
-            print("OMOP ddl file ({0}) not found".format(omopddl))
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
 
         return(self.process_ddl(fp))
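The switch from `open(omopddl)` to `omopddl.open()` pushes `Path` objects through the API, and `logger.exception` captures the traceback because it runs inside the `except` block. A standalone sketch of the same pattern - the `load_text` helper is hypothetical, not from the package:

```python
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

def load_text(path: Path) -> str:  # hypothetical helper, same shape as load_ddl
    try:
        with path.open("r") as fp:
            return fp.read()
    except OSError:
        # logger.exception logs at ERROR level and appends the active traceback
        logger.exception("file (%s) could not be opened", path)
        sys.exit(1)
```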
@@ -47,9 +55,13 @@ class OmopCDM:
         output_dict["datetime_fields"] = {}
         output_dict["date_fields"] = {}
 
+        ## Version number: matches '--postgresql', any run of characters, then digits of the form X.Y at the end of the line
         ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
-        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
+        ## Table name: matches 'CREATE TABLE', an optional '@schema.' prefix, then the table name (letters and underscores, now upper or lower case)
+        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
+        ## Column definition: leading whitespace, a lower-case field name (letters/underscores), whitespace, then the data type
         datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        ## End of a table definition: a closing bracket followed by a semicolon at the end of the line
         end_rgx = re.compile(r'.*[)];$')
         vermatched = False
         processing_table_data = False
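The widened `start_rgx` group (`[A-Z_]+` to `[a-zA-Z_]+`) is the behavioural change in this hunk: lower-case table names now match. A quick check against representative DDL lines (the sample strings are illustrative, not taken from the shipped SQL file):

```python
import re

start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')

m = start_rgx.match("CREATE TABLE @cdmDatabaseSchema.person (")
print(m.group(2))  # 'person' - the old [A-Z_]+ group would have rejected this

m = datatype_rgx.match("    person_id integer NOT NULL,")
print(m.group(1), m.group(2))  # 'person_id integer'
```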
@@ -76,7 +88,7 @@ class OmopCDM:
                 fname = idtmatch.group(1)
                 ftype = idtmatch.group(2)
 
-                # Check for dictionary element presence
+                # Check for dictionary element presence, and start an empty list if it doesn't already exist
                 if tabname not in output_dict["all_columns"]:
                     output_dict["all_columns"][tabname] = []
                 if tabname not in output_dict["numeric_fields"]:
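Aside: the presence-check-then-append pattern the new comment describes is what `collections.defaultdict` automates. A sketch of the equivalent, offered only as context, not as a suggested change to the diff:

```python
from collections import defaultdict

output_dict = {"all_columns": defaultdict(list)}
output_dict["all_columns"]["person"].append("person_id")  # no presence check needed
```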
carrot_transform-0.3.3.dist-info/METADATA

@@ -1,48 +0,0 @@
-Metadata-Version: 2.3
-Name: carrot_transform
-Version: 0.3.3
-Summary:
-Author: anwarfg
-Author-email: 913028+anwarfg@users.noreply.github.com
-Requires-Python: >=3.10,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: click (>=8.1.7,<9.0.0)
-Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
-Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Description-Content-Type: text/markdown
-
-<p align="center">
-  <a href="https://carrot.ac.uk/" target="_blank">
-    <picture>
-      <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
-      <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
-    </picture>
-  </a>
-</p>
-<div align="center">
-  <strong>
-  <h2>Streamlined Data Mapping to OMOP</h2>
-  <a href="https://carrot.ac.uk/">Carrot Tranform</a> executes the conversion of the data to the OMOP CDM.<br />
-  </strong>
-</div>
-
-TODO:
-
-- Document carrot-transform
-- Add more comments in-code
-- Handle capture of ddl and json config via the command-line as optional args
-
-Reduction in complexity over the original CaRROT-CDM version for the Transform part of _ETL_ - In practice _Extract_ is always
-performed by Data Partners, _Load_ by database bulk-load software.
-
-Statistics
-
-External libraries imported (approximate)
-
-carrot-cdm        61
-carrot-transform  12
-
carrot_transform-0.3.3.dist-info/RECORD

@@ -1,17 +0,0 @@
-carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
-carrottransform/_version.py,sha256=NfGqG2TgfjxxrlCHaOtwl3BcE0f6UH0VPrQgoDPjV7Y,72
-carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
-carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/subcommands/run.py,sha256=3z5cRG4ekyPOP5tvjZOyHUxbclKfBr_Z0tQRRoKj73E,20651
-carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
-carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
-carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
-carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
-carrottransform/tools/mappingrules.py,sha256=bV6tXHBwVeKAUgCwFTZE2-qTcxKtbs3zbJWedBSviVI,6567
-carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
-carrottransform/tools/omopcdm.py,sha256=ycyPGgUTUwui7MLxH8JXd-MyCRkG0xOfEoDhCXeogmQ,7623
-carrot_transform-0.3.3.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
-carrot_transform-0.3.3.dist-info/METADATA,sha256=23mVHLHLXOqgXUFLoU7cSaqIr_yzl9mYf_zgZnteeoY,1474
-carrot_transform-0.3.3.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-carrot_transform-0.3.3.dist-info/RECORD,,