carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic.

Files changed (33)
  1. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +302 -443
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/click.py +21 -0
  16. carrottransform/tools/concept_helpers.py +61 -0
  17. carrottransform/tools/core.py +163 -0
  18. carrottransform/tools/date_helpers.py +79 -0
  19. carrottransform/tools/file_helpers.py +177 -7
  20. carrottransform/tools/logger.py +19 -0
  21. carrottransform/tools/mapping_types.py +32 -0
  22. carrottransform/tools/mappingrules.py +298 -32
  23. carrottransform/tools/metrics.py +274 -49
  24. carrottransform/tools/omopcdm.py +42 -32
  25. carrottransform/tools/orchestrator.py +381 -0
  26. carrottransform/tools/person_helpers.py +126 -0
  27. carrottransform/tools/record_builder.py +413 -0
  28. carrottransform/tools/stream_helpers.py +71 -0
  29. carrottransform/tools/types.py +71 -0
  30. carrottransform/tools/validation.py +62 -0
  31. carrot_transform-0.3.4.dist-info/RECORD +0 -24
  32. carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
  33. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/orchestrator.py
@@ -0,0 +1,381 @@
+import csv
+from pathlib import Path
+from typing import Dict, Tuple, Any, Optional, List, Set
+import carrottransform.tools as tools
+from carrottransform.tools.mappingrules import MappingRules
+from carrottransform.tools.omopcdm import OmopCDM
+from carrottransform.tools.logger import logger_setup
+from carrottransform.tools.person_helpers import (
+    load_person_ids,
+    set_saved_person_id_file,
+)
+from carrottransform.tools.date_helpers import normalise_to8601
+from carrottransform.tools.types import (
+    ProcessingResult,
+    ProcessingContext,
+    RecordContext,
+)
+from carrottransform.tools.record_builder import RecordBuilderFactory
+from carrottransform.tools.file_helpers import OutputFileManager
+from carrottransform.tools.stream_helpers import StreamingLookupCache
+
+logger = logger_setup()
+
+
+class StreamProcessor:
+    """Efficient single-pass streaming processor"""
+
+    def __init__(self, context: ProcessingContext, lookup_cache: StreamingLookupCache):
+        self.context = context
+        self.cache = lookup_cache
+
+    def process_all_data(self) -> ProcessingResult:
+        """Process all data with single-pass streaming approach"""
+        logger.info("Processing data...")
+
+        total_output_counts = {outfile: 0 for outfile in self.context.output_files}
+        total_rejected_counts = {infile: 0 for infile in self.context.input_files}
+
+        # Process each input file
+        for source_filename in self.context.input_files:
+            try:
+                output_counts, rejected_count = self._process_input_file_stream(
+                    source_filename
+                )
+
+                # Update totals
+                for target_file, count in output_counts.items():
+                    total_output_counts[target_file] += count
+                total_rejected_counts[source_filename] = rejected_count
+
+            except Exception as e:
+                logger.error(f"Error processing file {source_filename}: {str(e)}")
+                return ProcessingResult(
+                    total_output_counts,
+                    total_rejected_counts,
+                    success=False,
+                    error_message=str(e),
+                )
+
+        return ProcessingResult(total_output_counts, total_rejected_counts)
+
+    def _process_input_file_stream(
+        self, source_filename: str
+    ) -> Tuple[Dict[str, int], int]:
+        """Stream process a single input file with direct output writing"""
+        logger.info(f"Streaming input file: {source_filename}")
+
+        file_path = self.context.input_dir / source_filename
+        if not file_path.exists():
+            logger.warning(f"Input file not found: {source_filename}")
+            return {}, 0
+
+        # Get which output tables this input file can map to
+        applicable_targets = self.cache.input_to_outputs.get(source_filename, set())
+        if not applicable_targets:
+            logger.info(f"No mappings found for {source_filename}")
+            return {}, 0
+
+        output_counts = {target: 0 for target in applicable_targets}
+        rejected_count = 0
+
+        # Get file metadata from cache
+        file_meta = self.cache.file_metadata_cache[source_filename]
+        if not file_meta["datetime_source"] or not file_meta["person_id_source"]:
+            logger.warning(f"Missing date or person ID mapping for {source_filename}")
+            return output_counts, rejected_count
+
+        try:
+            with file_path.open(mode="r", encoding="utf-8-sig") as fh:
+                csv_reader = csv.reader(fh)
+                csv_column_headers = next(csv_reader)
+                input_column_map = self.context.omopcdm.get_column_map(
+                    csv_column_headers
+                )
+
+                # Validate required columns exist
+                datetime_col_idx = input_column_map.get(file_meta["datetime_source"])
+                if datetime_col_idx is None:
+                    logger.warning(
+                        f"Date field {file_meta['datetime_source']} not found in {source_filename}"
+                    )
+                    return output_counts, rejected_count
+
+                # Stream process each row
+                for input_data in csv_reader:
+                    row_counts, row_rejected = self._process_single_row_stream(
+                        source_filename,
+                        input_data,
+                        input_column_map,
+                        applicable_targets,
+                        datetime_col_idx,
+                        file_meta,
+                    )
+
+                    for target, count in row_counts.items():
+                        output_counts[target] += count
+                    rejected_count += row_rejected
+
+        except Exception as e:
+            logger.error(f"Error streaming file {source_filename}: {str(e)}")
+
+        return output_counts, rejected_count
+
+    def _process_single_row_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        applicable_targets: Set[str],
+        datetime_col_idx: int,
+        file_meta: Dict[str, Any],
+    ) -> Tuple[Dict[str, int], int]:
+        """Process single row and write directly to all applicable output files"""
+
+        # Increment input count
+        self.context.metrics.increment_key_count(
+            source=source_filename,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="input_count",
+        )
+
+        # Normalize date once
+        fulldate = normalise_to8601(input_data[datetime_col_idx])
+        if fulldate is None:
+            self.context.metrics.increment_key_count(
+                source=source_filename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_date_fields",
+            )
+            return {}, 1
+
+        input_data[datetime_col_idx] = fulldate
+
+        row_output_counts = {}
+        total_rejected = 0
+
+        # Process this row for each applicable target table
+        for target_file in applicable_targets:
+            target_counts, target_rejected = self._process_row_for_target_stream(
+                source_filename, input_data, input_column_map, target_file, file_meta
+            )
+
+            row_output_counts[target_file] = target_counts
+            total_rejected += target_rejected
+
+        return row_output_counts, total_rejected
+
+    def _process_row_for_target_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        target_file: str,
+        file_meta: Dict[str, Any],
+    ) -> Tuple[int, int]:
+        """Process row for specific target and write records directly"""
+
+        v2_mapping = self.context.mappingrules.v2_mappings[target_file][source_filename]
+        target_column_map = self.context.target_column_maps[target_file]
+
+        # Get target metadata from cache
+        target_meta = self.cache.target_metadata_cache[target_file]
+        auto_num_col = target_meta["auto_num_col"]
+        person_id_col = target_meta["person_id_col"]
+        date_col_data = target_meta["date_col_data"]
+        date_component_data = target_meta["date_component_data"]
+        notnull_numeric_fields = target_meta["notnull_numeric_fields"]
+
+        data_columns = file_meta["data_fields"].get(target_file, [])
+
+        output_count = 0
+        rejected_count = 0
+
+        # Process each data column for this target
+        for data_column in data_columns:
+            if data_column not in input_column_map:
+                continue
+
+            column_output, column_rejected = self._process_data_column_stream(
+                source_filename,
+                input_data,
+                input_column_map,
+                target_file,
+                v2_mapping,
+                target_column_map,
+                data_column,
+                auto_num_col,
+                person_id_col,
+                date_col_data,
+                date_component_data,
+                notnull_numeric_fields,
+            )
+
+            output_count += column_output
+            rejected_count += column_rejected
+
+        return output_count, rejected_count
+
+    def _process_data_column_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        target_file: str,
+        v2_mapping,
+        target_column_map: Dict[str, int],
+        data_column: str,
+        auto_num_col: Optional[str],
+        person_id_col: str,
+        date_col_data: Dict[str, str],
+        date_component_data: Dict[str, Dict[str, str]],
+        notnull_numeric_fields: List[str],
+    ) -> Tuple[int, int]:
+        """Process data column and write records directly to output"""
+
+        rejected_count = 0
+        # Create context for record building with direct write capability
+        context = RecordContext(
+            tgtfilename=target_file,
+            tgtcolmap=target_column_map,
+            v2_mapping=v2_mapping,
+            srcfield=data_column,
+            srcdata=input_data,
+            srccolmap=input_column_map,
+            srcfilename=source_filename,
+            omopcdm=self.context.omopcdm,
+            metrics=self.context.metrics,
+            # Additional context for direct writing
+            person_lookup=self.context.person_lookup,
+            record_numbers=self.context.record_numbers,
+            file_handles=self.context.file_handles,
+            auto_num_col=auto_num_col,
+            person_id_col=person_id_col,
+            date_col_data=date_col_data,
+            date_component_data=date_component_data,
+            notnull_numeric_fields=notnull_numeric_fields,
+        )
+
+        # Build records
+        builder = RecordBuilderFactory.create_builder(context)
+        result = builder.build_records()
+
+        # Update metrics
+        self.context.metrics = result.metrics
+
+        if not result.success:
+            rejected_count += 1
+
+        return result.record_count, rejected_count
+
+
+class V2ProcessingOrchestrator:
+    """Main orchestrator for the entire V2 processing pipeline"""
+
+    def __init__(
+        self,
+        rules_file: Path,
+        output_dir: Path,
+        input_dir: Path,
+        person_file: Path,
+        omop_ddl_file: Optional[Path],
+        omop_config_file: Optional[Path],
+        write_mode: str = "w",
+    ):
+        self.rules_file = rules_file
+        self.output_dir = output_dir
+        self.input_dir = input_dir
+        self.person_file = person_file
+        self.omop_ddl_file = omop_ddl_file
+        self.omop_config_file = omop_config_file
+        self.write_mode = write_mode
+
+        # Initialize components immediately
+        self.initialize_components()
+
+    def initialize_components(self):
+        """Initialize all processing components"""
+        self.omopcdm = OmopCDM(self.omop_ddl_file, self.omop_config_file)
+        self.mappingrules = MappingRules(self.rules_file, self.omopcdm)
+
+        if not self.mappingrules.is_v2_format:
+            raise ValueError("Rules file is not in v2 format!")
+
+        self.metrics = tools.metrics.Metrics(self.mappingrules.get_dataset_name())
+        self.output_manager = OutputFileManager(self.output_dir, self.omopcdm)
+
+        # Pre-compute lookup cache for efficient streaming
+        self.lookup_cache = StreamingLookupCache(self.mappingrules, self.omopcdm)
+
+    def setup_person_lookup(self) -> Tuple[Dict[str, str], int]:
+        """Setup person ID lookup and save mapping"""
+        saved_person_id_file = set_saved_person_id_file(None, self.output_dir)
+
+        person_lookup, rejected_person_count = load_person_ids(
+            saved_person_id_file,
+            self.person_file,
+            self.mappingrules,
+            use_input_person_ids="N",
+        )
+
+        # Save person IDs
+        with saved_person_id_file.open(mode="w") as fhpout:
+            fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+            for person_id, person_assigned_id in person_lookup.items():
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")
+
+        return person_lookup, rejected_person_count
+
+    def execute_processing(self) -> ProcessingResult:
+        """Execute the complete processing pipeline with efficient streaming"""
+
+        try:
+            # Setup person lookup
+            person_lookup, rejected_person_count = self.setup_person_lookup()
+
+            # Setup output files - keep all open for streaming
+            output_files = self.mappingrules.get_all_outfile_names()
+            file_handles, target_column_maps = self.output_manager.setup_output_files(
+                output_files, self.write_mode
+            )
+
+            # Create processing context
+            context = ProcessingContext(
+                mappingrules=self.mappingrules,
+                omopcdm=self.omopcdm,
+                input_dir=self.input_dir,
+                person_lookup=person_lookup,
+                record_numbers={output_file: 1 for output_file in output_files},
+                file_handles=file_handles,
+                target_column_maps=target_column_maps,
+                metrics=self.metrics,
+            )
+
+            # Process data using efficient streaming approach
+            processor = StreamProcessor(context, self.lookup_cache)
+            result = processor.process_all_data()
+
+            # Log results
+            logger.info(
+                f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
+            )
+            for target_file, count in result.output_counts.items():
+                logger.info(f"TARGET: {target_file}: output count {count}")
+
+            # Write summary
+            data_summary = self.metrics.get_mapstream_summary()
+            with (self.output_dir / "summary_mapstream.tsv").open(mode="w") as dsfh:
+                dsfh.write(data_summary)
+
+            return result
+
+        finally:
+            # Always close files
+            if self.output_manager:
+                self.output_manager.close_all_files()
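
For orientation, a minimal sketch of how the new V2ProcessingOrchestrator added above might be driven directly. The rules and DDL paths are files added in this release; the input, output, and person-file paths are placeholders, and calling the orchestrator directly is an assumption (the release also adds a run_v2.py CLI subcommand, which is presumably the intended entry point).

from pathlib import Path

from carrottransform.tools.orchestrator import V2ProcessingOrchestrator

# Sketch only: the rules and DDL files ship with this release; other paths are hypothetical.
orchestrator = V2ProcessingOrchestrator(
    rules_file=Path("carrottransform/examples/test/rules/v2.json"),
    output_dir=Path("output"),                   # hypothetical output directory
    input_dir=Path("input"),                     # hypothetical directory of source CSVs
    person_file=Path("input/demographics.csv"),  # hypothetical person-level CSV
    omop_ddl_file=Path("carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql"),
    omop_config_file=None,                       # assumed optional, per the Optional[Path] hint
    write_mode="w",
)

result = orchestrator.execute_processing()
if result.success:
    for target, count in result.output_counts.items():
        print(f"{target}: {count} records")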
carrottransform/tools/person_helpers.py
@@ -0,0 +1,126 @@
+import csv
+import sys
+from pathlib import Path
+from carrottransform.tools.logger import logger_setup
+from carrottransform.tools.validation import valid_value, valid_date_value
+from carrottransform.tools.mappingrules import MappingRules
+
+logger = logger_setup()
+
+
+def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+
+    for last_ids_data in csvr:
+        last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+    fh.close()
+    return last_used_ids
+
+
+def load_person_ids(
+    saved_person_id_file,
+    person_file,
+    mappingrules: MappingRules,
+    use_input_person_ids,
+    delim=",",
+):
+    person_ids, person_number = _get_person_lookup(saved_person_id_file)
+
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter=delim)
+    person_columns = {}
+    person_col_in_hdr_number = 0
+    reject_count = 0
+    # Header row of the person file
+    personhdr = next(csvr)
+    # TODO: not sure if this is needed
+    logger.info("Headers in Person file: %s", personhdr)
+
+    # Make a dictionary of column names vs their positions
+    for col in personhdr:
+        person_columns[col] = person_col_in_hdr_number
+        person_col_in_hdr_number += 1
+
+    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+        "person"
+    )
+
+    ## get the column index of the PersonID from the input file
+    person_col = person_columns[person_id_source]
+
+    for persondata in csvr:
+        if not valid_value(
+            persondata[person_columns[person_id_source]]
+        ): # just checking that the id is not an empty string
+            reject_count += 1
+            continue
+        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+            reject_count += 1
+            continue
+        if (
+            persondata[person_col] not in person_ids
+        ): # if not already in person_ids dict, add it
+            if use_input_person_ids == "N":
+                person_ids[persondata[person_col]] = str(
+                    person_number
+                ) # create a new integer person_id
+                person_number += 1
+            else:
+                person_ids[persondata[person_col]] = str(
+                    persondata[person_col]
+                ) # use existing person_id
+    fh.close()
+
+    return person_ids, reject_count
+
+
+# TODO: understand the purpose of this function and simplify it
+def set_saved_person_id_file(
+    saved_person_id_file: Path | None, output_dir: Path
+) -> Path:
+    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
+    if saved_person_id_file is None:
+        saved_person_id_file = output_dir / "person_ids.tsv"
+        if saved_person_id_file.is_dir():
+            logger.exception(
+                f"the detected saved_person_id_file {saved_person_id_file} is already a dir"
+            )
+            sys.exit(1)
+        if saved_person_id_file.exists():
+            saved_person_id_file.unlink()
+    else:
+        if saved_person_id_file.is_dir():
+            logger.exception(
+                f"the passed saved_person_id_file {saved_person_id_file} is already a dir"
+            )
+            sys.exit(1)
+    return saved_person_id_file
+
+
+def _get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
+    # Saved-person-file existence test, reload if found, return last used integer
+    if saved_person_id_file.is_file():
+        person_lookup, last_used_integer = _load_saved_person_ids(saved_person_id_file)
+    else:
+        person_lookup = {}
+        last_used_integer = 1
+    return person_lookup, last_used_integer
+
+
+def _load_saved_person_ids(person_file: Path):
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+    last_int = 1
+    person_ids = {}
+
+    next(csvr)
+    for persondata in csvr:
+        person_ids[persondata[0]] = persondata[1]
+        last_int += 1
+
+    fh.close()
+    return person_ids, last_int
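
These person helpers mirror what setup_person_lookup() does in the orchestrator above: resolve output/person_ids.tsv (removing any stale copy), then stream the person CSV and build a source-to-assigned ID lookup. A minimal sketch, assuming hypothetical input paths and that the OMOP config file may be omitted:

from pathlib import Path

from carrottransform.tools.mappingrules import MappingRules
from carrottransform.tools.omopcdm import OmopCDM
from carrottransform.tools.person_helpers import load_person_ids, set_saved_person_id_file

output_dir = Path("output")                  # hypothetical
person_csv = Path("input/demographics.csv")  # hypothetical person-level CSV

# Rules/DDL files ship with this release; config file assumed optional (Optional[Path]).
omopcdm = OmopCDM(Path("carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql"), None)
rules = MappingRules(Path("carrottransform/examples/test/rules/v2.json"), omopcdm)

# Resolves to output/person_ids.tsv and deletes any stale copy from a previous run.
saved_ids = set_saved_person_id_file(None, output_dir)

# "N" assigns fresh sequential integer IDs instead of reusing the source IDs.
person_lookup, rejected = load_person_ids(
    saved_ids, person_csv, rules, use_input_person_ids="N"
)
print(f"loaded {len(person_lookup)} person IDs, rejected {rejected}")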