carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of carrot-transform has been flagged as potentially problematic.
Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/core.py
@@ -0,0 +1,163 @@
+ import carrottransform.tools as tools
+ from carrottransform.tools.omopcdm import OmopCDM
+ from carrottransform.tools.logger import logger_setup
+ from carrottransform.tools.validation import valid_value
+ from carrottransform.tools.date_helpers import get_datetime_value
+
+ logger = logger_setup()
+
+
+ def get_target_records(
+     tgtfilename: str,
+     tgtcolmap: dict[str, int],
+     rulesmap: dict[str, list[dict[str, list[str]]]],
+     srcfield: str,
+     srcdata: list[str],
+     srccolmap: dict[str, int],
+     srcfilename: str,
+     omopcdm: OmopCDM,
+     metrics: tools.metrics.Metrics,
+ ) -> tuple[bool, list[list[str]], tools.metrics.Metrics]:
+     """
+     build all target records for a given input field
+     """
+     build_records = False
+     tgtrecords = []
+     # Get field definitions from OMOP CDM
+     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+     # Build keys to look up rules
+     srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+
+     # Check if source field has a value
+     if valid_value(str(srcdata[srccolmap[srcfield]])):
+         ## check if either or both of the srckey and srcfullkey are in the rules
+         srcfullkey = (
+             srcfilename
+             + "~"
+             + srcfield
+             + "~"
+             + str(srcdata[srccolmap[srcfield]])
+             + "~"
+             + tgtfilename
+         )
+
+         dictkeys = []
+         # Check if we have rules for either the full key or just the source field
+         if tgtfilename == "person":
+             build_records = True
+             dictkeys.append(srcfilename + "~person")
+         elif srcfullkey in rulesmap:
+             build_records = True
+             dictkeys.append(srcfullkey)
+         if srckey in rulesmap:
+             build_records = True
+             dictkeys.append(srckey)
+
+         if build_records:
+             # Process each matching rule
+             for dictkey in dictkeys:
+                 for out_data_elem in rulesmap[dictkey]:
+                     valid_data_elem = True
+                     ## create an empty list to store the data; populate numeric elements with "0" instead of an empty string
+                     tgtarray = [""] * len(tgtcolmap)
+                     # Initialize numeric fields to 0
+                     for req_integer in notnull_numeric_fields:
+                         tgtarray[tgtcolmap[req_integer]] = "0"
+
+                     # Process each field mapping
+                     for infield, outfield_list in out_data_elem.items():
+                         if tgtfilename == "person" and isinstance(outfield_list, dict):
+                             # Handle term mappings for person records
+                             input_value = srcdata[srccolmap[infield]]
+                             if str(input_value) in outfield_list:
+                                 for output_col_data in outfield_list[str(input_value)]:
+                                     if "~" in output_col_data:
+                                         # Handle mapped values (like gender codes)
+                                         outcol, term = output_col_data.split("~")
+                                         tgtarray[tgtcolmap[outcol]] = term
+                                     else:
+                                         # Direct field copy
+                                         tgtarray[tgtcolmap[output_col_data]] = srcdata[
+                                             srccolmap[infield]
+                                         ]
+                         else:
+                             # Handle direct field copies and non-person records
+                             for output_col_data in outfield_list:
+                                 if "~" in output_col_data:
+                                     # Handle mapped values (like gender codes)
+                                     outcol, term = output_col_data.split("~")
+                                     tgtarray[tgtcolmap[outcol]] = term
+                                 else:
+                                     # Direct field copy
+                                     tgtarray[tgtcolmap[output_col_data]] = srcdata[
+                                         srccolmap[infield]
+                                     ]
+
+                         # get the value. this is our ISO 8601 value that was previously normalised
+                         source_date = srcdata[srccolmap[infield]]
+
+                         # Special handling for date fields
+                         if output_col_data in date_component_data:
+                             # this side of the if/else seems to be for birthdates, which are split up into four fields
+
+                             # parse the date and store it in the old format ... as a way to branch
+                             # ... this check might be redundant. the datetime values should be ones that have already been normalised
+                             dt = get_datetime_value(source_date.split(" ")[0])
+                             if dt is None:
+                                 # if (as above) dt isn't going to be None then this branch shouldn't happen
+                                 # maybe birthdates can be None?
+
+                                 metrics.increment_key_count(
+                                     source=srcfilename,
+                                     fieldname=srcfield,
+                                     tablename=tgtfilename,
+                                     concept_id="all",
+                                     additional="",
+                                     count_type="invalid_date_fields",
+                                 )
+                                 valid_data_elem = False
+                             else:
+                                 year_field = date_component_data[output_col_data][
+                                     "year"
+                                 ]
+                                 month_field = date_component_data[output_col_data][
+                                     "month"
+                                 ]
+                                 day_field = date_component_data[output_col_data][
+                                     "day"
+                                 ]
+                                 tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                 tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                 tgtarray[tgtcolmap[day_field]] = str(dt.day)
+
+                                 tgtarray[tgtcolmap[output_col_data]] = source_date
+
+                         elif (
+                             output_col_data in date_col_data
+                         ):  # date_col_data maps each datetime field K to the date field where only the date part of srcdata[K] is copied; present for all dates
+                             # this fork of the if/else seems to be for non-birthdates, which are handled differently
+
+                             # copy the full value into the datetime field
+                             tgtarray[tgtcolmap[output_col_data]] = source_date
+
+                             # select the first 10 chars, which will be YYYY-MM-DD
+                             tgtarray[tgtcolmap[date_col_data[output_col_data]]] = (
+                                 source_date[:10]
+                             )
+
+                     if valid_data_elem:
+                         tgtrecords.append(tgtarray)
+     else:
+         metrics.increment_key_count(
+             source=srcfilename,
+             fieldname=srcfield,
+             tablename=tgtfilename,
+             concept_id="all",
+             additional="",
+             count_type="invalid_source_fields",
+         )
+
+     return build_records, tgtrecords, metrics
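
A note for readers of this hunk: the rules lookup works on plain tilde-joined strings built from the source file, source field, optionally the source value, and the target table. A minimal sketch of the three key shapes used above, with invented file and field names:

    # hypothetical example values, for illustration only
    srcfilename, srcfield, value, tgtfilename = "demo.csv", "sex", "F", "observation"

    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"              # "demo.csv~sex~observation"
    srcfullkey = f"{srcfilename}~{srcfield}~{value}~{tgtfilename}"  # "demo.csv~sex~F~observation"
    personkey = f"{srcfilename}~person"                             # person targets always use this key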
carrottransform/tools/date_helpers.py
@@ -0,0 +1,79 @@
+ import datetime
+ import re
+
+
+ def get_datetime_value(item: str) -> datetime.datetime | None:
+     """
+     Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+     or UK format (DD/MM/YYYY).
+     Returns a datetime object if successful, None otherwise.
+     """
+     date_formats = [
+         "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
+         "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
+         "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
+     ]
+
+     for date_format in date_formats:
+         try:
+             return datetime.datetime.strptime(item, date_format)
+         except ValueError:
+             continue
+
+     # If we get here, none of the formats worked
+     return None
+
+
+ def normalise_to8601(item: str) -> str:
+     """parses, normalises, and formats a date value using regexes
+
+     could use just one regex but that seems bad.
+     """
+
+     both = item.split(" ")
+
+     match = re.match(r"(?P<year>\d{4})[-/](?P<month>\d{2})[-/](?P<day>\d{2})", both[0])
+     if not match:
+         match = re.match(
+             r"(?P<day>\d{2})[-/](?P<month>\d{2})[-/](?P<year>\d{4})", both[0]
+         )
+
+     if not match:
+         raise Exception(f"invalid date format {item=}")
+
+     data = match.groupdict()
+     year, month, day = data["year"], data["month"], data["day"]
+     value = str(int(year)).zfill(4)
+     value += "-"
+     value += str(int(month)).zfill(2)
+     value += "-"
+     value += str(int(day)).zfill(2)
+     value += " "
+
+     if 2 == len(both):
+         match = re.match(
+             r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2})(\.\d{6})?)?", both[1]
+         )
+         if match:
+             data = match.groupdict()
+             hour, minute, second = data["hour"], data["minute"], data["second"]
+         else:
+             hour, minute, second = None, None, None
+
+         # concat the time_suffix
+         if hour is not None:
+             if minute is None:
+                 raise Exception(
+                     f"unrecognized format seems to have 'hours' but not 'minutes' {item=}"
+                 )
+
+             value += str(int(hour)).zfill(2)
+             value += ":"
+             value += str(int(minute)).zfill(2)
+             value += ":"
+             value += str(int(second if second is not None else "0")).zfill(2)
+
+     if ":" not in value:
+         value += "00:00:00"
+
+     return value
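
Going by the formats and regexes above, the two helpers should behave roughly as follows; this is a sketch of expected behaviour, not code from the package:

    from carrottransform.tools.date_helpers import get_datetime_value, normalise_to8601

    # get_datetime_value tries ISO, then DD-MM-YYYY, then DD/MM/YYYY
    assert get_datetime_value("2021-03-01").year == 2021
    assert get_datetime_value("01/03/2021").day == 1
    assert get_datetime_value("2021.03.01") is None  # unrecognised separator

    # normalise_to8601 always emits "YYYY-MM-DD HH:MM:SS"
    assert normalise_to8601("01/03/2021") == "2021-03-01 00:00:00"
    assert normalise_to8601("2021-03-01 12:30") == "2021-03-01 12:30:00"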
carrottransform/tools/file_helpers.py
@@ -1,11 +1,11 @@
+ import csv
  import json
  import logging
- import os
  import sys
- import json
  import importlib.resources as resources
- from typing import List, Optional
+ from typing import IO, Iterator, List, Optional, Dict, TextIO, Tuple, cast
  from pathlib import Path
+ from carrottransform.tools.omopcdm import OmopCDM

  logger = logging.getLogger(__name__)

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
  def load_json(f_in: Path):
      try:
          data = json.load(f_in.open())
-     except Exception as err:
+     except Exception:
          logger.exception("{0} not found. Or cannot parse as json".format(f_in))
          sys.exit()

@@ -26,16 +26,160 @@ def load_json(f_in: Path):
  def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
      """Resolve special path syntaxes in command line arguments."""
      try:
-         with resources.files('carrottransform').joinpath('__init__.py') as f:
-             package_path = f.parent
+         # Fix for Traversable parent issue - convert to Path first
+         package_files = resources.files("carrottransform")
+         package_path = Path(str(package_files)).resolve()
      except Exception:
          # Fallback for development environment
          import carrottransform
+
          package_path = Path(carrottransform.__file__).resolve().parent
-
+
      # Handle None values and replace @carrot with the actual package path
-     prefix = '@carrot'
+     prefix = "@carrot"
      return [
-         package_path / Path(str(arg).replace(prefix, '').lstrip('/')) if arg is not None and str(arg).startswith(prefix) else arg
+         (
+             package_path
+             / Path(str(arg).replace(prefix, "").replace("\\", "/").lstrip("/"))
+             if arg is not None and str(arg).startswith(prefix)
+             else arg
+         )
          for arg in args
      ]
+
+
+ def check_dir_isvalid(directory: Path, create_if_missing: bool = False) -> None:
+     """Check if directory is valid, optionally create it if missing.
+
+     Args:
+         directory: Directory path
+         create_if_missing: If True, create the directory if it doesn't exist
+     """
+     ## if not a directory, create it if requested (including parents; this option is for the output directory only)
+     if not directory.is_dir():
+         if create_if_missing:
+             try:
+                 ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+                 directory.mkdir(parents=True)
+                 logger.info(f"Created directory: {directory}")
+             except OSError as e:
+                 logger.warning(f"Failed to create directory {directory}: {e}")
+                 sys.exit(1)
+         else:
+             logger.warning(f"Not a directory: {directory}")
+             sys.exit(1)
+
+
+ def check_files_in_rules_exist(
+     rules_input_files: list[str], existing_input_files: list[str]
+ ) -> None:
+     for infile in existing_input_files:
+         if infile not in rules_input_files:
+             msg = (
+                 "WARNING: no mapping rules found for existing input file - {0}".format(
+                     infile
+                 )
+             )
+             logger.warning(msg)
+     for infile in rules_input_files:
+         if infile not in existing_input_files:
+             msg = "WARNING: no data for mapped input file - {0}".format(infile)
+             logger.warning(msg)
+
+
+ def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+     """opens a file and returns its handle together with a CSV reader over it"""
+     try:
+         fh = file_path.open(mode="r", encoding="utf-8-sig")
+         csvr = csv.reader(fh)
+         return fh, csvr
+     except IOError as e:
+         logger.exception("Unable to open: {0}".format(file_path))
+         logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
+         return None
+
+
+ def set_omop_filenames(
+     omop_ddl_file: Optional[Path],
+     omop_config_file: Optional[Path],
+     omop_version: Optional[str],
+ ) -> tuple[Optional[Path], Optional[Path]]:
+     """
+     Set default OMOP file paths when not explicitly provided.
+
+     This function provides a convenience mechanism where users can specify just
+     an OMOP version instead of providing full paths to both DDL and config files.
+
+     Args:
+         omop_ddl_file: Path to OMOP DDL file (optional)
+         omop_config_file: Path to OMOP config file (optional)
+         omop_version: OMOP version string (e.g., "5.3", "5.4")
+
+     Returns:
+         Tuple of (config_file_path, ddl_file_path) - either provided or defaults
+
+     Example:
+         # User provides version but no files - defaults will be used
+         config, ddl = set_omop_filenames(None, None, "5.3")
+
+         # User provides custom files - they will be returned unchanged
+         config, ddl = set_omop_filenames(custom_ddl, custom_config, "5.3")
+     """
+     # Only set defaults if BOTH files are None AND version is provided
+     if omop_ddl_file is None and omop_config_file is None and omop_version is not None:
+         logger.info(f"Using default OMOP files for version {omop_version}")
+
+         # Set default config file - convert Traversable to Path
+         config_traversable = resources.files("carrottransform") / "config" / "omop.json"
+         omop_config_file = Path(str(config_traversable))
+
+         # Set version-specific DDL file - convert Traversable to Path
+         omop_ddl_file_name = f"OMOPCDM_postgresql_{omop_version}_ddl.sql"
+         ddl_traversable = (
+             resources.files("carrottransform") / "config" / omop_ddl_file_name
+         )
+         omop_ddl_file = Path(str(ddl_traversable))
+
+         # Validate that the default files exist (now safe since they're Path objects)
+         if not omop_config_file.is_file():
+             logger.warning(f"Default config file not found: {omop_config_file}")
+         if not omop_ddl_file.is_file():
+             logger.warning(f"Default DDL file not found: {omop_ddl_file}")
+
+     return omop_config_file, omop_ddl_file
+
+
+ class OutputFileManager:
+     """Manages output file creation and cleanup"""
+
+     def __init__(self, output_dir: Path, omopcdm: OmopCDM):
+         self.output_dir = output_dir
+         self.omopcdm = omopcdm
+         self.file_handles: Dict[str, TextIO] = {}
+
+     def setup_output_files(
+         self, output_files: List[str], write_mode: str
+     ) -> Tuple[Dict[str, TextIO], Dict[str, Dict[str, int]]]:
+         """Setup output files and return file handles and column maps"""
+         target_column_maps = {}
+
+         for target_file in output_files:
+             file_path = (self.output_dir / target_file).with_suffix(".tsv")
+             self.file_handles[target_file] = cast(
+                 TextIO, file_path.open(mode=write_mode, encoding="utf-8")
+             )
+             if write_mode == "w":
+                 output_header = self.omopcdm.get_omop_column_list(target_file)
+                 self.file_handles[target_file].write("\t".join(output_header) + "\n")
+
+             target_column_maps[target_file] = self.omopcdm.get_omop_column_map(
+                 target_file
+             )
+
+         return self.file_handles, target_column_maps
+
+     def close_all_files(self):
+         """Close all open file handles"""
+         for fh in self.file_handles.values():
+             fh.close()
+         self.file_handles.clear()
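
Two of the additions above are easiest to understand from the call side. A hedged sketch (the concrete paths below are illustrative, not guaranteed locations):

    from pathlib import Path
    from carrottransform.tools.file_helpers import resolve_paths, set_omop_filenames

    # "@carrot/..." arguments are rebased onto the installed package directory;
    # anything else (including None) passes through unchanged
    args = [Path("@carrot/config/omop.json"), Path("/tmp/rules.json"), None]
    resolved = resolve_paths(args)
    # resolved[0] -> <site-packages>/carrottransform/config/omop.json (location varies)
    # resolved[1] -> /tmp/rules.json, resolved[2] -> None

    # giving only a version selects the bundled config and DDL files
    config, ddl = set_omop_filenames(None, None, "5.4")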
carrottransform/tools/logger.py
@@ -0,0 +1,19 @@
+ import logging
+ from logging import Logger
+
+
+ def logger_setup() -> Logger:
+     logger = logging.getLogger(__name__)
+     if not logger.handlers:
+         logger.setLevel(logging.INFO)
+
+         console_handler = logging.StreamHandler()
+         console_handler.setLevel(logging.INFO)
+
+         formatter = logging.Formatter(
+             "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+         )
+         console_handler.setFormatter(formatter)
+
+         logger.addHandler(console_handler)
+     return logger
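
The `logger.handlers` guard makes this helper idempotent, so repeated calls don't attach duplicate handlers; that is why modules such as core.py above can simply call it at import time:

    from carrottransform.tools.logger import logger_setup

    logger = logger_setup()
    logger.info("handlers are only attached on the first call")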
carrottransform/tools/mapping_types.py
@@ -0,0 +1,32 @@
+ from typing import Dict, List, Optional
+ from dataclasses import dataclass
+
+
+ # To prevent circular imports, these types live in a separate file rather than in types.py
+ @dataclass
+ class PersonIdMapping:
+     source_field: str
+     dest_field: str
+
+
+ @dataclass
+ class DateMapping:
+     source_field: str
+     dest_fields: List[str]
+
+
+ @dataclass
+ class ConceptMapping:
+     source_field: str
+     value_mappings: Dict[
+         str, Dict[str, List[int]]
+     ]  # value -> dest_field -> concept_ids
+     original_value_fields: List[str]
+
+
+ @dataclass
+ class V2TableMapping:
+     source_table: str
+     person_id_mapping: Optional[PersonIdMapping]
+     date_mapping: Optional[DateMapping]
+     concept_mappings: Dict[str, ConceptMapping]  # source_field -> ConceptMapping
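
To make the nesting of V2TableMapping concrete, here is a hypothetical instance for a single source table; every file, field, and concept id below is invented for illustration (8532 is the OMOP concept commonly used for FEMALE, but treat it as an example):

    from carrottransform.tools.mapping_types import (
        ConceptMapping,
        DateMapping,
        PersonIdMapping,
        V2TableMapping,
    )

    mapping = V2TableMapping(
        source_table="demo.csv",
        person_id_mapping=PersonIdMapping(source_field="PersonID", dest_field="person_id"),
        date_mapping=DateMapping(
            source_field="visit_date",
            dest_fields=["observation_date", "observation_datetime"],
        ),
        concept_mappings={
            "sex": ConceptMapping(
                source_field="sex",
                # value -> dest_field -> concept_ids
                value_mappings={"F": {"gender_concept_id": [8532]}},
                original_value_fields=["value_source_value"],
            )
        },
    )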