PyPI - carrot-transform - Versions diffs - 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

carrot-transform 0.3.5py3-none-any.whl → 0.4.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of carrot-transform might be problematic. Click here for more details.

Files changed (32) hide show

{carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
carrot_transform-0.4.0.dist-info/RECORD +41 -0
{carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
carrottransform/__init__.py +1 -1
carrottransform/_version.py +2 -2
carrottransform/cli/command.py +9 -5
carrottransform/cli/subcommands/run.py +214 -526
carrottransform/cli/subcommands/run_v2.py +145 -0
carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
carrottransform/examples/test/rules/v1.json +280 -0
carrottransform/examples/test/rules/v2.json +115 -0
carrottransform/tools/__init__.py +4 -14
carrottransform/tools/args.py +128 -0
carrottransform/tools/concept_helpers.py +61 -0
carrottransform/tools/core.py +163 -0
carrottransform/tools/date_helpers.py +79 -0
carrottransform/tools/file_helpers.py +153 -9
carrottransform/tools/logger.py +19 -0
carrottransform/tools/mapping_types.py +32 -0
carrottransform/tools/mappingrules.py +297 -34
carrottransform/tools/metrics.py +162 -109
carrottransform/tools/omopcdm.py +37 -32
carrottransform/tools/orchestrator.py +381 -0
carrottransform/tools/person_helpers.py +126 -0
carrottransform/tools/record_builder.py +413 -0
carrottransform/tools/stream_helpers.py +71 -0
carrottransform/tools/types.py +71 -0
carrottransform/tools/validation.py +62 -0
carrot_transform-0.3.5.dist-info/RECORD +0 -25
carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
{carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0

carrottransform/examples/test/rules/v1.json ADDED Viewed

@@ -0,0 +1,280 @@
+{
+  "metadata": {
+    "date_created": "2025-07-22T10:41:33.297641+00:00",
+    "dataset": "transform"
+  },
+  "cdm": {
+    "observation": {
+      "Asian or Asian British 34532": {
+        "observation_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "observation_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Asian": 35825508 }
+        },
+        "observation_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Asian": 35825508 }
+        },
+        "observation_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        },
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "value_as_string": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        }
+      },
+      "Indian 34534": {
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "observation_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "observation_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Indian": 35826241 }
+        },
+        "observation_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Indian": 35826241 }
+        },
+        "observation_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        },
+        "value_as_string": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        }
+      },
+      "White and Asian 34537": {
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "observation_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "observation_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White and Asian": 35827395 }
+        },
+        "observation_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White and Asian": 35827395 }
+        },
+        "observation_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        },
+        "value_as_string": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        }
+      },
+      "Bangladesh 34542": {
+        "value_as_string": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        },
+        "observation_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        },
+        "observation_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White and Asian": 35825531 }
+        },
+        "observation_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White and Asian": 35825531 }
+        },
+        "observation_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        }
+      }
+    },
+    "condition_occurrence": {
+      "Cough 34538": {
+        "person_id": {
+          "source_table": "Symptoms.csv",
+          "source_field": "PersonID"
+        },
+        "condition_start_datetime": {
+          "source_table": "Symptoms.csv",
+          "source_field": "visit_date"
+        },
+        "condition_end_datetime": {
+          "source_table": "Symptoms.csv",
+          "source_field": "visit_date"
+        },
+        "condition_source_concept_id": {
+          "source_table": "Symptoms.csv",
+          "source_field": "symptom1",
+          "term_mapping": { "Y": 254761 }
+        },
+        "condition_concept_id": {
+          "source_table": "Symptoms.csv",
+          "source_field": "symptom1",
+          "term_mapping": { "Y": 254761 }
+        },
+        "condition_source_value": {
+          "source_table": "Symptoms.csv",
+          "source_field": "symptom1"
+        }
+      }
+    },
+    "person": {
+      "FEMALE 34539": {
+        "gender_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex"
+        },
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "birth_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "gender_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex",
+          "term_mapping": { "F": 8532 }
+        },
+        "gender_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex",
+          "term_mapping": { "F": 8532 }
+        }
+      },
+      "MALE 34540": {
+        "gender_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex"
+        },
+        "gender_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex",
+          "term_mapping": { "M": 8507 }
+        },
+        "gender_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "sex",
+          "term_mapping": { "M": 8507 }
+        },
+        "birth_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        }
+      },
+      "White 212273": {
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "birth_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "race_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White": 8527 }
+        },
+        "race_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "White": 8527 }
+        },
+        "race_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        }
+      },
+      "African 212274": {
+        "person_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "PersonID"
+        },
+        "birth_datetime": {
+          "source_table": "Demographics.csv",
+          "source_field": "date_of_birth"
+        },
+        "race_source_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Black": 38003600 }
+        },
+        "race_concept_id": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity",
+          "term_mapping": { "Black": 38003600 }
+        },
+        "race_source_value": {
+          "source_table": "Demographics.csv",
+          "source_field": "ethnicity"
+        }
+      }
+    },
+    "measurement": {
+      "IgG 34541": {
+        "measurement_source_value": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "IgG"
+        },
+        "person_id": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "PersonID"
+        },
+        "measurement_datetime": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "date"
+        },
+        "value_as_number": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "IgG"
+        },
+        "measurement_source_concept_id": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "IgG",
+          "term_mapping": 37398191
+        },
+        "measurement_concept_id": {
+          "source_table": "covid19_antibody.csv",
+          "source_field": "IgG",
+          "term_mapping": 37398191
+        }
+      }
+    }
+  }
+}

carrottransform/examples/test/rules/v2.json ADDED Viewed

@@ -0,0 +1,115 @@
+{
+  "metadata": {
+    "date_created": "2025-07-22T11:13:38.203888+00:00",
+    "dataset": "transform"
+  },
+  "cdm": {
+    "condition_occurrence": {
+      "Symptoms.csv": {
+        "person_id_mapping": {
+          "source_field": "PersonID",
+          "dest_field": "person_id"
+        },
+        "date_mapping": {
+          "source_field": "visit_date",
+          "dest_field": ["condition_start_datetime", "condition_end_datetime"]
+        },
+        "concept_mappings": {
+          "symptom1": {
+            "Y": {
+              "condition_source_concept_id": [254761],
+              "condition_concept_id": [254761]
+            },
+            "original_value": ["condition_source_value"]
+          }
+        }
+      }
+    },
+    "measurement": {
+      "covid19_antibody.csv": {
+        "person_id_mapping": {
+          "source_field": "PersonID",
+          "dest_field": "person_id"
+        },
+        "date_mapping": {
+          "source_field": "date",
+          "dest_field": ["measurement_datetime"]
+        },
+        "concept_mappings": {
+          "IgG": {
+            "*": {
+              "measurement_source_concept_id": [37398191],
+              "measurement_concept_id": [37398191]
+            },
+            "original_value": ["measurement_source_value", "value_as_number"]
+          }
+        }
+      }
+    },
+    "observation": {
+      "Demographics.csv": {
+        "person_id_mapping": {
+          "source_field": "PersonID",
+          "dest_field": "person_id"
+        },
+        "date_mapping": {
+          "source_field": "date_of_birth",
+          "dest_field": ["observation_datetime"]
+        },
+        "concept_mappings": {
+          "ethnicity": {
+            "White and Asian": {
+              "observation_concept_id": [35825531, 35827395],
+              "observation_source_concept_id": [35825531, 35827395]
+            },
+            "Indian": {
+              "observation_concept_id": [35826241],
+              "observation_source_concept_id": [35826241]
+            },
+            "Asian": {
+              "observation_concept_id": [35825508],
+              "observation_source_concept_id": [35825508]
+            },
+            "original_value": ["value_as_string", "observation_source_value"]
+          }
+        }
+      }
+    },
+    "person": {
+      "Demographics.csv": {
+        "person_id_mapping": {
+          "source_field": "PersonID",
+          "dest_field": "person_id"
+        },
+        "date_mapping": {
+          "source_field": "date_of_birth",
+          "dest_field": ["birth_datetime"]
+        },
+        "concept_mappings": {
+          "ethnicity": {
+            "Black": {
+              "race_source_concept_id": [38003600],
+              "race_concept_id": [38003600]
+            },
+            "White": {
+              "race_concept_id": [8527],
+              "race_source_concept_id": [8527]
+            },
+            "original_value": ["race_source_value"]
+          },
+          "sex": {
+            "M": {
+              "gender_concept_id": [8507],
+              "gender_source_concept_id": [8507]
+            },
+            "F": {
+              "gender_concept_id": [8532],
+              "gender_source_concept_id": [8532]
+            },
+            "original_value": ["gender_source_value"]
+          }
+        }
+      }
+    }
+  }
+}

carrottransform/tools/__init__.py CHANGED Viewed

@@ -1,17 +1,7 @@
-import carrottransform
-import os
-import sys
-import json
-import time
+from .file_helpers import load_json as load_json
-from .file_helpers import (
-  load_json
-)
+from .metrics import Metrics as Metrics
-from .metrics import (
-  Metrics
-)
-from . import mappingrules
-from . import omopcdm
+from . import mappingrules as mappingrules
+from . import omopcdm as omopcdm

carrottransform/tools/args.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+functions to handle args
+"""
+from pathlib import Path
class OnlyOnePersonInputAllowed(Exception):
    """Raised when the person rules read from more than one source table.

    The constructor arguments are kept on the instance as ``_rules_file``,
    ``_person_file`` and ``_inputs`` (the set of source-table names that the
    person rules referenced).
    """

    def __init__(self, rules_file: Path, person_file: Path, inputs: set[str]):
        # Forward the arguments unchanged so ``args`` (and therefore pickling
        # and copying) behave exactly as they did before.
        super().__init__(rules_file, person_file, inputs)
        self._rules_file = rules_file
        self._person_file = person_file
        self._inputs = inputs

    def __str__(self) -> str:
        """Return a readable summary instead of the raw ``args`` tuple."""
        tables = ", ".join(sorted(self._inputs))
        return (
            f"person rules in {self._rules_file} read from more than one "
            f"source table ({tables}); expected only {self._person_file}"
        )
class NoPersonMappings(Exception):
    """Raised when the rules file contains no usable person mappings.

    The constructor arguments are kept on the instance as ``_rules_file``
    and ``_person_file``.
    """

    def __init__(self, rules_file: Path, person_file: Path):
        # Forward the arguments unchanged so ``args`` (and therefore pickling
        # and copying) behave exactly as they did before.
        super().__init__(rules_file, person_file)
        self._rules_file = rules_file
        self._person_file = person_file

    def __str__(self) -> str:
        """Return a readable summary instead of the raw ``args`` tuple."""
        return (
            f"no person mappings found in {self._rules_file} "
            f"(person file: {self._person_file})"
        )
class WrongInputException(Exception):
    """Raised when the person rules read from a table that is not the person file.

    This covers the case where the rules use a single source table, but that
    table's name differs from the person file's name. The constructor
    arguments are kept on the instance as ``_rules_file``, ``_person_file``
    and ``_source_table``.
    """

    def __init__(self, rules_file: Path, person_file: Path, source_table: str):
        # Forward the arguments unchanged so ``args`` (and therefore pickling
        # and copying) behave exactly as they did before.
        super().__init__(rules_file, person_file, source_table)
        self._rules_file = rules_file
        self._person_file = person_file
        self._source_table = source_table

    def __str__(self) -> str:
        """Return a readable summary instead of the raw ``args`` tuple."""
        return (
            f"person rules in {self._rules_file} read from "
            f"{self._source_table!r}, not from the person file "
            f"{self._person_file}"
        )
class ObjectQueryError(Exception):
    """Signals a malformed object path.

    ``object_query`` raises this for a path that starts or ends with ``/``
    or that has a blank key.
    """
class ObjectStructureError(Exception):
    """Signals that a well-formed object path cannot be followed.

    ``object_query`` raises this when a key is missing from the object or
    when the path would have to descend into a value that is not a dict.
    """
def person_rules_check(person_file: Path, rules_file: Path) -> None:
    """Check that the person rules in ``rules_file`` agree with ``person_file``.

    All person/patient records (including the gender mapping) must come from
    one source table, and that table must be the file given as
    ``person_file``.

    Args:
        person_file: Path to the person file. Only its file name is compared
            with the ``source_table`` values found in the rules.
        rules_file: Path to the JSON rules file whose ``cdm/person`` section
            is inspected.

    Raises:
        FileNotFoundError: If either path is not an existing file.
        NoPersonMappings: If the rules have no ``person`` section, or that
            section does not reference any source table.
        OnlyOnePersonInputAllowed: If the person rules reference more than
            one source table.
        WrongInputException: If the single referenced source table is not
            named like ``person_file``.
        ObjectStructureError: For other structural problems reported by
            ``object_query`` (for example a missing ``cdm`` key).
    """
    # Local import: the module itself only imports ``Path`` at file level.
    import json

    # Check the arguments are real files. FileNotFoundError is a subclass of
    # Exception, so callers that catch Exception keep working.
    if not person_file.is_file():
        raise FileNotFoundError(f"person file not found: {person_file=}")
    if not rules_file.is_file():
        raise FileNotFoundError(f"rules file not found: {rules_file=}")

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(rules_file, encoding="utf-8") as file:
        rules_json = json.load(file)

    try:
        person_rules = object_query(rules_json, "cdm/person")
    except ObjectStructureError as e:
        # Only a missing "person" key means "no person mappings"; any other
        # structural problem is passed on to the caller untouched.
        if "Key 'person' not found in object" == str(e):
            raise NoPersonMappings(rules_file, person_file) from e
        raise

    # Collect every source table used by any column of any person rule.
    seen_inputs: set[str] = {
        column["source_table"]
        for rule in person_rules.values()
        for column in rule.values()
    }

    # Covers an empty `"person": {}` section as well as rules that list no
    # columns; both are valid JSON but give us nothing to check.
    if not seen_inputs:
        raise NoPersonMappings(rules_file, person_file)

    # Detect too many input files.
    if 1 < len(seen_inputs):
        raise OnlyOnePersonInputAllowed(rules_file, person_file, seen_inputs)

    # Exactly one table was seen; it has to be the person file.
    (seen_table,) = seen_inputs
    if person_file.name != seen_table:
        raise WrongInputException(rules_file, person_file, seen_table)
+def object_query(data: dict[str, dict | str], path: str):
+    """
+    Navigate a nested dictionary using a `/`-delimited path string.
+    Args:
+        data: The dictionary to traverse.
+        path: The object path, e.g., "/foo/bar".
+    Returns:
+        The value at the given path.
+    Raises:
+        ObjectQueryError: If the path format is invalid or the key is missing.
+    """
+    if path.startswith("/") or path.endswith("/"):
+        raise ObjectQueryError(
+            f"Invalid path format: {path!r} (must not start with '/' and not end with '/')"
+        )
+    current_key, _, remaining_path = path.partition("/")
+    if not current_key:
+        raise ObjectQueryError(f"Invalid path: blank key at start in {path!r}")
+    if current_key not in data:
+        raise ObjectStructureError(f"Key {current_key!r} not found in object")
+    value = data[current_key]
+    if not remaining_path:
+        return value
+    if not isinstance(value, dict):
+        raise ObjectStructureError(
+            f"Cannot descend into non-dict value at key {current_key!r}"
+        )
+    return object_query(value, remaining_path)

carrottransform/tools/concept_helpers.py ADDED Viewed

@@ -0,0 +1,61 @@
+from typing import Dict, List, Optional
+from carrottransform.tools.mapping_types import ConceptMapping
def generate_combinations(
    value_mapping: Optional[Dict[str, List[int]]],
) -> List[Dict[str, int]]:
    """
    Generate all concept combinations for multiple concept IDs

    Combination ``i`` takes the concept ID at index ``i`` from every
    destination field. A field with fewer IDs than the longest one repeats
    its last ID, and a field with no IDs is left out of every combination.

    NOTE: uneven numbers of concept IDs across fields are handled, but that
    scenario needs more investigation; for now the lists are expected to
    have equal length.

    For example, if value_mapping is:
    {
        "observation_concept_id": [35827395, 35825531],
        "observation_source_concept_id": [35827395, 35825531]
    }
    This returns:
    [
        {"observation_concept_id": 35827395, "observation_source_concept_id": 35827395},
        {"observation_concept_id": 35825531, "observation_source_concept_id": 35825531}
    ]

    Args:
        value_mapping: Destination field name -> list of concept IDs, or
            None.

    Returns:
        One dict per combination. The list is empty when value_mapping is
        None or empty, or when none of its fields has any concept ID.
    """
    if not value_mapping:
        return []

    # Only fields that actually carry concept IDs take part.
    populated = {
        dest_field: concept_ids
        for dest_field, concept_ids in value_mapping.items()
        if concept_ids
    }

    # default=0 keeps max() from raising ValueError when every list is empty.
    max_concepts = max(
        (len(concept_ids) for concept_ids in populated.values()), default=0
    )

    return [
        {
            # Use the concept at index i, or the last one if not enough concepts
            dest_field: concept_ids[min(i, len(concept_ids) - 1)]
            for dest_field, concept_ids in populated.items()
        }
        for i in range(max_concepts)
    ]
def get_value_mapping(
    concept_mapping: ConceptMapping, source_value: str
) -> Optional[Dict[str, List[int]]]:
    """
    Look up the value mapping that applies to a source value.

    Keys of ``concept_mapping.value_mappings`` are tried in this order:
    1. the source value itself (exact match)
    2. the wildcard ``*``, which maps every value to the same concepts

    Returns None when neither key is present.
    """
    # Candidates in priority order; the first key that exists wins.
    for candidate in (source_value, "*"):
        if candidate in concept_mapping.value_mappings:
            return concept_mapping.value_mappings[candidate]
    return None

carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

carrot-transform 0.3.5py3-none-any.whl → 0.4.0py3-none-any.whl