carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,29 +1,122 @@
1
- import os
2
1
  import json
2
+ from pathlib import Path
3
3
  import carrottransform.tools as tools
4
- from .omopcdm import OmopCDM
4
+ from typing import Dict, Any, List, Optional
5
+ from carrottransform.tools.mapping_types import (
6
+ PersonIdMapping,
7
+ DateMapping,
8
+ ConceptMapping,
9
+ V2TableMapping,
10
+ )
11
+ from carrottransform.tools.logger import logger_setup
12
+ from carrottransform.tools.omopcdm import OmopCDM
13
+
14
+ logger = logger_setup()
5
15
 
6
- import logging
7
- logger = logging.getLogger(__name__)
8
16
 
9
17
  class MappingRules:
10
18
  """
11
- self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
19
+ self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
12
20
  as a file-specific dictionary allowing rules to be "looked-up" depending on data content
13
21
  """
14
22
 
15
- def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
23
+ def __init__(self, rulesfilepath: Path, omopcdm: OmopCDM):
16
24
  ## just loads the json directly
17
- self.rules_data = tools.load_json(rulesfilepath)
25
+ self.rules_data = tools.load_json(Path(rulesfilepath))
18
26
  self.omopcdm = omopcdm
19
-
20
- self.parsed_rules = {}
21
- self.outfile_names = {}
27
+
28
+ # Detect format version and parse accordingly
29
+ self.is_v2_format = self._is_v2_format()
30
+ if self.is_v2_format:
31
+ logger.info("Detected v2.json format, using direct v2 parser...")
32
+ self.v2_mappings = self._parse_v2_format()
33
+ else:
34
+ logger.info("Detected v1.json format, using legacy parser...")
35
+
36
+ self.parsed_rules: Dict[str, Dict[str, Any]] = {}
37
+ self.outfile_names: Dict[str, List[str]] = {}
22
38
 
23
39
  self.dataset_name = self.get_dsname_from_rules()
24
40
 
41
+ def _is_v2_format(self) -> bool:
42
+ """
43
+ Detect if the rules file is in v2 format by checking for characteristic v2 structures
44
+ """
45
+ # Check if any table has the v2 structure (source_table -> mapping_types)
46
+ for table_name, table_data in self.rules_data["cdm"].items():
47
+ if isinstance(table_data, dict):
48
+ for key, value in table_data.items():
49
+ # v2 format has CSV filenames as keys, with mapping types as values
50
+ if isinstance(value, dict) and all(
51
+ mapping_type in value
52
+ for mapping_type in [
53
+ "person_id_mapping",
54
+ "date_mapping",
55
+ "concept_mappings",
56
+ ]
57
+ ):
58
+ return True
59
+ return False
60
+
61
+ def _parse_v2_format(self) -> Dict[str, Dict[str, V2TableMapping]]:
62
+ """
63
+ Parse v2 format into clean data structures
64
+ Returns: Dict[table_name, Dict[source_table, V2TableMapping]]
65
+ """
66
+ v2_mappings: Dict[str, Dict[str, V2TableMapping]] = {}
67
+
68
+ for table_name, table_data in self.rules_data["cdm"].items():
69
+ v2_mappings[table_name] = {}
70
+
71
+ for source_table, mappings in table_data.items():
72
+ # Parse person_id_mapping
73
+ person_id_mapping = None
74
+ if "person_id_mapping" in mappings:
75
+ pid_data = mappings["person_id_mapping"]
76
+ person_id_mapping = PersonIdMapping(
77
+ source_field=pid_data["source_field"],
78
+ dest_field=pid_data["dest_field"],
79
+ )
80
+
81
+ # Parse date_mapping
82
+ date_mapping = None
83
+ if "date_mapping" in mappings:
84
+ date_data = mappings["date_mapping"]
85
+ date_mapping = DateMapping(
86
+ source_field=date_data["source_field"],
87
+ dest_fields=date_data["dest_field"],
88
+ )
89
+
90
+ # Parse concept_mappings
91
+ concept_mappings = {}
92
+ if "concept_mappings" in mappings:
93
+ for source_field, field_mappings in mappings[
94
+ "concept_mappings"
95
+ ].items():
96
+ original_value_fields = field_mappings.get("original_value", [])
97
+ value_mappings = {}
98
+
99
+ for source_value, dest_mappings in field_mappings.items():
100
+ if source_value != "original_value":
101
+ value_mappings[source_value] = dest_mappings
102
+
103
+ concept_mappings[source_field] = ConceptMapping(
104
+ source_field=source_field,
105
+ value_mappings=value_mappings,
106
+ original_value_fields=original_value_fields,
107
+ )
108
+
109
+ v2_mappings[table_name][source_table] = V2TableMapping(
110
+ source_table=source_table,
111
+ person_id_mapping=person_id_mapping,
112
+ date_mapping=date_mapping,
113
+ concept_mappings=concept_mappings,
114
+ )
115
+
116
+ return v2_mappings
117
+
25
118
  def dump_parsed_rules(self):
26
- return(json.dumps(self.parsed_rules, indent=2))
119
+ return json.dumps(self.parsed_rules, indent=2)
27
120
 
28
121
  def get_dsname_from_rules(self):
29
122
  dsname = "Unknown"
@@ -38,23 +131,62 @@ class MappingRules:
38
131
  return self.dataset_name
39
132
 
40
133
  def get_all_outfile_names(self):
41
- return list(self.rules_data["cdm"])
134
+ if self.is_v2_format:
135
+ return list(self.v2_mappings.keys())
136
+ else:
137
+ return list(self.rules_data["cdm"])
42
138
 
43
139
  def get_all_infile_names(self):
140
+ if self.is_v2_format:
141
+ return self._get_all_infile_names_v2()
142
+ else:
143
+ return self._get_all_infile_names_v1()
144
+
145
+ def _get_all_infile_names_v2(self) -> List[str]:
146
+ """Get all input file names from v2 format"""
44
147
  file_list = []
148
+ for table_mappings in self.v2_mappings.values():
149
+ for source_table in table_mappings.keys():
150
+ if source_table not in file_list:
151
+ file_list.append(source_table)
152
+ return file_list
45
153
 
154
+ def _get_all_infile_names_v1(self) -> List[str]:
155
+ """Get all input file names from v1 format (legacy method)"""
156
+ file_list = []
46
157
  for outfilename, conditions in self.rules_data["cdm"].items():
47
158
  for outfield, source_field in conditions.items():
48
159
  for source_field_name, source_data in source_field.items():
49
160
  if "source_table" in source_data:
50
161
  if source_data["source_table"] not in file_list:
51
162
  file_list.append(source_data["source_table"])
52
-
53
163
  return file_list
54
-
55
- def get_infile_data_fields(self, infilename):
56
- data_fields_lists = {}
57
164
 
165
+ def get_infile_data_fields(self, infilename: str):
166
+ if self.is_v2_format:
167
+ return self._get_infile_data_fields_v2(infilename)
168
+ else:
169
+ return self._get_infile_data_fields_v1(infilename)
170
+
171
+ def _get_infile_data_fields_v2(self, infilename: str) -> Dict[str, List[str]]:
172
+ """Get data fields for a specific input file from v2 format"""
173
+ data_fields_lists: Dict[str, List[str]] = {}
174
+
175
+ for table_name, table_mappings in self.v2_mappings.items():
176
+ if infilename in table_mappings:
177
+ mapping = table_mappings[infilename]
178
+ data_fields_lists[table_name] = []
179
+
180
+ # Add fields from concept mappings
181
+ for source_field in mapping.concept_mappings.keys():
182
+ if source_field not in data_fields_lists[table_name]:
183
+ data_fields_lists[table_name].append(source_field)
184
+
185
+ return data_fields_lists
186
+
187
+ def _get_infile_data_fields_v1(self, infilename: str) -> Dict[str, List[str]]:
188
+ """Get data fields for a specific input file from v1 format (legacy method)"""
189
+ data_fields_lists: Dict[str, List[str]] = {}
58
190
  outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
59
191
 
60
192
  for outfilename in outfilenames:
@@ -73,7 +205,36 @@ class MappingRules:
73
205
 
74
206
  return data_fields_lists
75
207
 
76
- def get_infile_date_person_id(self, infilename):
208
+ def get_infile_date_person_id(self, infilename: str):
209
+ if self.is_v2_format:
210
+ return self._get_infile_date_person_id_v2(infilename)
211
+ else:
212
+ return self._get_infile_date_person_id_v1(infilename)
213
+
214
+ # TODO: combine this with _get_person_source_field_info_v2
215
+ def _get_infile_date_person_id_v2(self, infilename: str) -> tuple[str, str]:
216
+ """Get datetime and person_id source fields for v2 format"""
217
+ datetime_source = ""
218
+ person_id_source = ""
219
+
220
+ for table_mappings in self.v2_mappings.values():
221
+ if infilename in table_mappings:
222
+ mapping = table_mappings[infilename]
223
+
224
+ if mapping.date_mapping:
225
+ datetime_source = mapping.date_mapping.source_field
226
+
227
+ if mapping.person_id_mapping:
228
+ person_id_source = mapping.person_id_mapping.source_field
229
+
230
+ # If we found both, we can break
231
+ if datetime_source and person_id_source:
232
+ break
233
+
234
+ return datetime_source, person_id_source
235
+
236
+ def _get_infile_date_person_id_v1(self, infilename: str) -> tuple[str, str]:
237
+ """Get datetime and person_id source fields for v1 format (legacy method)"""
77
238
  outfilenames, outdata = self.parse_rules_src_to_tgt(infilename)
78
239
  datetime_source = ""
79
240
  person_id_source = ""
@@ -83,27 +244,65 @@ class MappingRules:
83
244
  outfile = keydata[-1]
84
245
  for outfield_elem in outfield_data:
85
246
  for infield, outfield_list in outfield_elem.items():
86
- logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
247
+ logger.debug(
248
+ "{0}, {1}, {2}".format(outfile, infield, str(outfield_list))
249
+ )
87
250
  for outfield in outfield_list:
88
- if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
251
+ if outfield.split("~")[
252
+ 0
253
+ ] in self.omopcdm.get_omop_datetime_fields(outfile):
89
254
  datetime_source = infield
90
- if outfield.split('~')[0] == self.omopcdm.get_omop_person_id_field(outfile):
255
+ if outfield.split("~")[
256
+ 0
257
+ ] == self.omopcdm.get_omop_person_id_field(outfile):
91
258
  person_id_source = infield
92
259
 
93
260
  return datetime_source, person_id_source
94
261
 
95
- def get_person_source_field_info(self, tgtfilename):
262
+ def get_person_source_field_info(self, tgtfilename: str):
263
+ if self.is_v2_format:
264
+ return self._get_person_source_field_info_v2(tgtfilename)
265
+ else:
266
+ return self._get_person_source_field_info_v1(tgtfilename)
267
+
268
+ def _get_person_source_field_info_v2(
269
+ self, tgtfilename: str
270
+ ) -> tuple[Optional[str], Optional[str]]:
96
271
  """
97
- Specific discovery of input data field names for 'person' in these rules
272
+ Get person source field info for v2 format,
273
+ from the dest. table "Person" in the rules file.
98
274
  """
99
275
  birth_datetime_source = None
100
276
  person_id_source = None
277
+
278
+ if tgtfilename in self.v2_mappings:
279
+ for mapping in self.v2_mappings[tgtfilename].values():
280
+ if mapping.date_mapping:
281
+ birth_datetime_source = mapping.date_mapping.source_field
282
+
283
+ if mapping.person_id_mapping:
284
+ person_id_source = mapping.person_id_mapping.source_field
285
+
286
+ # If we found both, we can break
287
+ if birth_datetime_source and person_id_source:
288
+ break
289
+
290
+ return birth_datetime_source, person_id_source
291
+
292
+ def _get_person_source_field_info_v1(
293
+ self, tgtfilename: str
294
+ ) -> tuple[Optional[str], Optional[str]]:
295
+ """Get person source field info for v1 format (legacy method)"""
296
+ birth_datetime_source = None
297
+ person_id_source = None
101
298
  if tgtfilename in self.rules_data["cdm"]:
102
299
  source_rules_data = self.rules_data["cdm"][tgtfilename]
103
300
  ## this loops over all the fields in the person part of the rules, which will lead to overwriting of the source variables and unneccesary looping
104
301
  for rule_name, rule_fields in source_rules_data.items():
105
302
  if "birth_datetime" in rule_fields:
106
- birth_datetime_source = rule_fields["birth_datetime"]["source_field"]
303
+ birth_datetime_source = rule_fields["birth_datetime"][
304
+ "source_field"
305
+ ]
107
306
  if "person_id" in rule_fields:
108
307
  person_id_source = rule_fields["person_id"]["source_field"]
109
308
 
@@ -125,7 +324,23 @@ class MappingRules:
125
324
  if key != "":
126
325
  if key not in outdata:
127
326
  outdata[key] = []
128
- outdata[key].append(data)
327
+ if key.split("~")[-1] == "person":
328
+ outdata[key].append(data)
329
+
330
+ if key.split("~")[-1] == "person":
331
+ # Find matching source field keys and merge their dictionaries
332
+ for source_field, value in data.items():
333
+ if source_field in outdata[key][0] and isinstance(
334
+ outdata[key][0][source_field], dict
335
+ ):
336
+ # Merge the dictionaries for this source field
337
+ outdata[key][0][source_field].update(value)
338
+ else:
339
+ # If no matching dict or new source field, just set it
340
+ outdata[key][0][source_field] = value
341
+ pass
342
+ else:
343
+ outdata[key].append(data)
129
344
  if outfilename not in outfilenames:
130
345
  outfilenames.append(outfilename)
131
346
 
@@ -137,27 +352,75 @@ class MappingRules:
137
352
  """
138
353
  Process rules for an infile, outfile combination
139
354
  """
140
- outkey = ""
141
355
  data = {}
356
+ ### used for mapping simple fields that are always mapped (e.g., dob)
142
357
  plain_key = ""
143
- term_value_key = ""
358
+ term_value_key = "" ### used for mapping terms (e.g., gender, race, ethnicity)
144
359
 
145
360
  ## iterate through the rules, looking for rules that apply to the input file.
146
361
  for outfield, source_info in rules.items():
147
- if source_info["source_field"] not in data:
148
- data[source_info["source_field"]] = []
362
+ # Check if this rule applies to our input file
149
363
  if source_info["source_table"] == infilename:
150
364
  if "term_mapping" in source_info:
151
365
  if type(source_info["term_mapping"]) is dict:
152
366
  for inputvalue, term in source_info["term_mapping"].items():
153
- ## add a key/add to the list of data in the dict for the given input file
154
- term_value_key = infilename + "~" + source_info["source_field"] + "~" + str(inputvalue) + "~" + outfilename
155
- data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"][str(inputvalue)]))
367
+ if outfilename == "person":
368
+ term_value_key = infilename + "~person"
369
+ source_field = source_info["source_field"]
370
+ if source_field not in data:
371
+ data[source_field] = {}
372
+ if str(inputvalue) not in data[source_field]:
373
+ try:
374
+ data[source_field][str(inputvalue)] = []
375
+ except TypeError:
376
+ ### need to convert data[source_field] to a dict
377
+ ### like this: {'F': ['gender_concept_id~8532', 'gender_source_concept_id~8532', 'gender_source_value']}
378
+ temp_data_list = data[source_field].copy()
379
+ data[source_field] = {}
380
+ data[source_field][str(inputvalue)] = (
381
+ temp_data_list
382
+ )
383
+
384
+ data[source_field][str(inputvalue)].append(
385
+ outfield + "~" + str(term)
386
+ )
387
+ else:
388
+ term_value_key = (
389
+ infilename
390
+ + "~"
391
+ + source_info["source_field"]
392
+ + "~"
393
+ + str(inputvalue)
394
+ + "~"
395
+ + outfilename
396
+ )
397
+ if source_info["source_field"] not in data:
398
+ data[source_info["source_field"]] = []
399
+ data[source_info["source_field"]].append(
400
+ outfield + "~" + str(term)
401
+ )
156
402
  else:
157
- plain_key = infilename + "~" + source_info["source_field"] + "~" + outfilename
158
- data[source_info["source_field"]].append(outfield + "~" + str(source_info["term_mapping"]))
403
+ plain_key = (
404
+ infilename
405
+ + "~"
406
+ + source_info["source_field"]
407
+ + "~"
408
+ + outfilename
409
+ )
410
+ if source_info["source_field"] not in data:
411
+ data[source_info["source_field"]] = []
412
+ data[source_info["source_field"]].append(
413
+ outfield + "~" + str(source_info["term_mapping"])
414
+ )
159
415
  else:
160
- data[source_info["source_field"]].append(outfield)
416
+ if source_info["source_field"] not in data:
417
+ data[source_info["source_field"]] = []
418
+ if type(data[source_info["source_field"]]) is dict:
419
+ data[source_info["source_field"]][str(inputvalue)].append(
420
+ outfield
421
+ )
422
+ else:
423
+ data[source_info["source_field"]].append(outfield)
161
424
  if term_value_key != "":
162
425
  return term_value_key, data
163
426