carrot-transform 0.3.4-py3-none-any.whl → 0.3.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic.
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/METADATA +2 -2
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/RECORD +11 -10
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/WHEEL +1 -1
- carrottransform/cli/subcommands/run.py +361 -190
- carrottransform/tools/click.py +21 -0
- carrottransform/tools/file_helpers.py +30 -4
- carrottransform/tools/mappingrules.py +5 -2
- carrottransform/tools/metrics.py +212 -40
- carrottransform/tools/omopcdm.py +8 -3
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE +0 -0
- {carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/entry_points.txt +0 -0
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: carrot_transform
-Version: 0.3.4
+Version: 0.3.5
 Summary:
 Author: anwarfg
 Author-email: 913028+anwarfg@users.noreply.github.com
@@ -12,8 +12,8 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+Requires-Dist: numpy (<2)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Requires-Dist: pytest (>=8.3.4,<9.0.0)
 Description-Content-Type: text/markdown

 <p align="center">
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/RECORD
CHANGED

@@ -3,7 +3,7 @@ carrottransform/_version.py,sha256=bm7SM-_MN0gstlNsCDO6dAajKcjQD-NxI_xpvfRx0Ts,1
 carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
 carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/subcommands/run.py,sha256=
+carrottransform/cli/subcommands/run.py,sha256=GfRHG_aLoBxuXkpGTTrRmsEcNUjTUB6cl8f1B7lTBt8,28461
 carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
 carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
 carrottransform/examples/test/inputs/Covid19_test.csv,sha256=d5t7Lfhkwbfe3Uk2IBqB2ZT5o0h9QaeraC8E5-IMERo,67521
@@ -13,12 +13,13 @@ carrottransform/examples/test/inputs/covid19_antibody.csv,sha256=SPCpyqpTbVq9987
 carrottransform/examples/test/inputs/vaccine.csv,sha256=_gcM-SIymyt2Dkkr_zGmQI9keIdmDm-gDI_QvXXLFrY,44037
 carrottransform/examples/test/rules/rules_14June2021.json,sha256=n2OYNFhbx-NLhmqjAad6RsfXjQFknZIgQ7a5uyJF0Co,13226
 carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
-carrottransform/tools/
-carrottransform/tools/
-carrottransform/tools/
-carrottransform/tools/
-
-carrot_transform-0.3.
-carrot_transform-0.3.
-carrot_transform-0.3.
-carrot_transform-0.3.
+carrottransform/tools/click.py,sha256=5fxl9zL6piwWMN4cSule0tG90E9g7eFNosoSu1ES1og,471
+carrottransform/tools/file_helpers.py,sha256=_NRswYjqpBBkp4efMBhFf9XIRaqYTw1-jA22usyrbqA,1204
+carrottransform/tools/mappingrules.py,sha256=jvWTLCQoLoCegmLWHPyRSRVOTLejp7LzmFMr-ENmuTU,7121
+carrottransform/tools/metrics.py,sha256=VrcePVGwgHCJqQ1i9Q_KqL6Cv8IbIce2pSRSBth9808,11011
+carrottransform/tools/omopcdm.py,sha256=fcqIub5ud57i-5J3iUvPi2dqfGgyjWnWJTH1djQzq9E,8603
+carrot_transform-0.3.5.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+carrot_transform-0.3.5.dist-info/METADATA,sha256=cW5wfZRrZoai-nnV5k9FVYY8-XGm24Qadu0hYV4P9R8,4206
+carrot_transform-0.3.5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+carrot_transform-0.3.5.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
+carrot_transform-0.3.5.dist-info/RECORD,,
carrottransform/cli/subcommands/run.py
CHANGED

@@ -1,45 +1,65 @@
+import carrottransform
+import carrottransform.tools as tools
+import click
 import csv
-import os, time
 import datetime
 import fnmatch
-import sys
-import click
-import json
 import importlib.resources
-import
-import
+import json
+import logging
+import os
+import sys
+import time
+
+from carrottransform.tools.click import PathArgs
 from carrottransform.tools.omopcdm import OmopCDM
-from
+from pathlib import Path
+
+from typing import Iterator, IO, Iterable
+from ...tools.file_helpers import resolve_paths
+
+logger = logging.getLogger(__name__)
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)

+    logger.addHandler(console_handler)

 @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
 def run():
     pass

+
 @click.command()
-@click.option("--rules-file",
+@click.option("--rules-file", type=PathArgs,
              required=True,
              help="json file containing mapping rules")
-@click.option("--output-dir",
+@click.option("--output-dir", type=PathArgs,
              default=None,
+             required=True,
              help="define the output directory for OMOP-format tsv files")
 @click.option("--write-mode",
              default='w',
              type=click.Choice(['w','a']),
              help="force write-mode on output files")
-@click.option("--person-file",
+@click.option("--person-file", type=PathArgs,
              required=True,
              help="File containing person_ids in the first column")
-@click.option("--omop-ddl-file",
+@click.option("--omop-ddl-file", type=PathArgs,
              required=False,
              help="File containing OHDSI ddl statements for OMOP tables")
-@click.option("--omop-config-file",
+@click.option("--omop-config-file", type=PathArgs,
              required=False,
              help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
              required=False,
              help="Quoted string containing omop version - eg '5.3'")
-@click.option("--saved-person-id-file",
+@click.option("--saved-person-id-file", type=PathArgs,
              default=None,
              required=False,
              help="Full path to person id file used to save person_id state and share person_ids between data sets")
@@ -47,7 +67,7 @@ def run():
              required=False,
              default='N',
              help="Use person ids as input without generating new integers")
-@click.option("--last-used-ids-file",
+@click.option("--last-used-ids-file", type=PathArgs,
              default=None,
              required=False,
              help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
@@ -55,46 +75,108 @@ def run():
              required=False,
              default=0,
              help="Lower outcount limit for logfile output")
-@click.
-
-
-
-
-
-
+@click.option("--input-dir", type=PathArgs,
+             required=True,
+             multiple=True,
+             help="Input directories")
+def mapstream(
+    rules_file: Path,
+    output_dir: Path,
+    write_mode,
+    person_file: Path,
+    omop_ddl_file: Path,
+    omop_config_file: Path,
+    omop_version,
+    saved_person_id_file: Path,
+    use_input_person_ids,
+    last_used_ids_file: Path,
+    log_file_threshold,
+    input_dir: Iterable[Path],
+):
     """
     Map to output using input streams
     """
-
+
+
+    # Resolve any @package paths in the arguments
+    resolved_paths = resolve_paths([
+        rules_file,
+        output_dir,
+        person_file,
+        omop_ddl_file,
+        omop_config_file,
+        saved_person_id_file,
+        last_used_ids_file,
+        input_dir[0] if input_dir else None  # Take first element of input_dir tuple
+    ])
+
+    # Assign back resolved paths
+    [rules_file, output_dir, person_file, omop_ddl_file,
+     omop_config_file, saved_person_id_file, last_used_ids_file,
+     input_dir] = resolved_paths
+
+    # Ensure input_dir is a list of paths
+    if isinstance(input_dir, (Path, str)):
+        input_dir = [input_dir]
+    elif isinstance(input_dir, tuple):
+        input_dir = list(input_dir)
+    # If it's already a list, leave it as is
+
+    # Initialisation
     # - check for values in optional arguments
     # - read in configuration files
     # - check main directories for existence
     # - handle saved person ids
     # - initialise metrics
-
-
-
-
+    logger.info(
+        ",".join(
+            map(
+                str,
+                [
+                    rules_file,
+                    output_dir,
+                    write_mode,
+                    person_file,
+                    omop_ddl_file,
+                    omop_config_file,
+                    omop_version,
+                    saved_person_id_file,
+                    use_input_person_ids,
+                    last_used_ids_file,
+                    log_file_threshold,
+                    input_dir,
+                ],
+            )
+        )
+    )

     ## set omop filenames
-    omop_config_file, omop_ddl_file = set_omop_filenames(
+    omop_config_file, omop_ddl_file = set_omop_filenames(
+        omop_ddl_file, omop_config_file, omop_version
+    )
     ## check directories are valid
-
-
+    for idir in input_dir:
+        check_dir_isvalid(idir)  # Input directory must exist
+    check_dir_isvalid(output_dir, create_if_missing=True)  # Create output directory if needed
+

     saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
-
-
+
+    start_time = time.time()
     ## create OmopCDM object, which contains attributes and methods for the omop data tables.
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)

     ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
-    nowtime = time.time()

-
-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+    logger.info(
+        f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
+    )
+
     output_files = mappingrules.get_all_outfile_names()

     ## set record number
@@ -102,31 +184,30 @@ def mapstream(rules_file, output_dir, write_mode,
     record_numbers = {}
     for output_file in output_files:
         record_numbers[output_file] = 1
-    if last_used_ids_file
-
-        record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+    if (last_used_ids_file is not None) and last_used_ids_file.is_file():
+        record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)

     fhd = {}
     tgtcolmaps = {}

-
-
     try:
         ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
-        person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+        person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+                                                               person_file, mappingrules,
+                                                               use_input_person_ids)
         ## open person_ids output file
-        with open(
+        with saved_person_id_file.open(mode="w") as fhpout:
             ## write the header to the file
             fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
             ##iterate through the ids and write them to the file.
             for person_id, person_assigned_id in person_lookup.items():
-                fhpout.write("{
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")

         ## Initialise output files (adding them to a dict), output a header for each
         ## these aren't being closed deliberately
         for tgtfile in output_files:
-            fhd[tgtfile] =
-            if write_mode ==
+            fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+            if write_mode == "w":
                 outhdr = omopcdm.get_omop_column_list(tgtfile)
                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
             ## maps all omop columns for each file into a dict containing the column name and the index
@@ -134,13 +215,13 @@ def mapstream(rules_file, output_dir, write_mode,
             tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

     except IOError as e:
-
+        logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
         exit()

-
+    logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")

     ## Compare files found in the input_dir with those expected based on mapping rules
-    existing_input_files =
+    existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
     rules_input_files = mappingrules.get_all_infile_names()

     ## Log mismatches but continue
@@ -149,7 +230,7 @@ def mapstream(rules_file, output_dir, write_mode,
     ## set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-
+    logger.info(rules_input_files)

     ## set up per-input counts
     for srcfilename in rules_input_files:
@@ -162,7 +243,7 @@ def mapstream(rules_file, output_dir, write_mode,
         rejcounts = {}
         rcount = 0

-        fh, csvr = open_file(input_dir[0]
+        fh, csvr = open_file(input_dir[0] / srcfilename)
         if fh is None:
             continue

@@ -181,21 +262,37 @@ def mapstream(rules_file, output_dir, write_mode,
         inputcolmap = omopcdm.get_column_map(hdrdata)
         pers_id_col = inputcolmap[infile_person_id_source]
         datetime_col = inputcolmap[infile_datetime_source]
-
-
+
+        logger.info(
+            "--------------------------------------------------------------------------------"
+        )
+        logger.info(f"Processing input: {srcfilename}")

         # for each input record
         for indata in csvr:
-
-
+            metrics.increment_key_count(
+                source=srcfilename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_count"
+            )
             rcount += 1
             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
             strdate = indata[datetime_col].split(" ")[0]
             fulldate = parse_date(strdate)
-            if fulldate
+            if fulldate is not None:
                 indata[datetime_col] = fulldate
             else:
-                metrics.increment_key_count(
+                metrics.increment_key_count(
+                    source=srcfilename,
+                    fieldname="all",
+                    tablename="all",
+                    concept_id="all",
+                    additional="",
+                    count_type="input_date_fields"
+                )
                 continue

             for tgtfile in tgtfiles:
@@ -209,9 +306,9 @@ def mapstream(rules_file, output_dir, write_mode,

                 for datacol in datacols:
                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
-                    if built_records
+                    if built_records:
                         for outrecord in outrecords:
-                            if auto_num_col
+                            if auto_num_col is not None:
                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
                                 ### most of the rest of this section is actually to do with metrics
                                 record_numbers[tgtfile] += 1
@@ -219,70 +316,61 @@ def mapstream(rules_file, output_dir, write_mode,
                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
                                 outcounts[tgtfile] += 1

-
+                                metrics.increment_with_datacol(
+                                    source_path=srcfilename,
+                                    target_file=tgtfile,
+                                    datacol=datacol,
+                                    out_record=outrecord
+                                )

                                 # write the line to the file
                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
                             else:
-
-
+                                metrics.increment_key_count(
+                                    source=srcfilename,
+                                    fieldname="all",
+                                    tablename=tgtfile,
+                                    concept_id="all",
+                                    additional="",
+                                    count_type="invalid_person_ids",
+                                )
                                 rejidcounts[srcfilename] += 1

         fh.close()

-
-        print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+        logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
         for outtablename, count in outcounts.items():
-
+            logger.info(f"TARGET: {outtablename}: output count {str(count)}")
     # END main processing loop

-
+    logger.info(
+        "--------------------------------------------------------------------------------"
+    )
+
     data_summary = metrics.get_mapstream_summary()
     try:
-        dsfh =
+        dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
         dsfh.write(data_summary)
         dsfh.close()
     except IOError as e:
-
-
+        logger.exception(f"I/O error({e.errno}): {e.strerror}")
+        logger.exception("Unable to write file")
+        raise e

     # END mapstream
-
-
-
-    def
-
-
-
-
-
-
-
-
-    if tgtfile == "person":
-        key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
-        metrics.increment_key_count(key, "output_count")
-    else:
-        key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-
-        key = "all~all~all~" + outrecord[2] + "~"
-        metrics.increment_key_count(key, "output_count")
-    return
-
-
-def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
-tuple[bool, list[str], tools.metrics.Metrics]:
+    logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
+
+
+def get_target_records(
+    tgtfilename: str,
+    tgtcolmap: dict[str, dict[str, int]],
+    rulesmap: dict[str, list[dict[str, list[str]]]],
+    srcfield: str,
+    srcdata: list[str],
+    srccolmap: dict[str, int],
+    srcfilename: str,
+    omopcdm: OmopCDM,
+    metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
     """
     build all target records for a given input field
     """
@@ -292,8 +380,8 @@ tuple[bool, list[str], tools.metrics.Metrics]:
     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)

-    srckey = srcfilename
-    summarykey =
+    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+    summarykey = srckey + "~all~"
     if valid_value(str(srcdata[srccolmap[srcfield]])):
         ## check if either or both of the srckey and summarykey are in the rules
         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
@@ -304,7 +392,7 @@ tuple[bool, list[str], tools.metrics.Metrics]:
         if srckey in rulesmap:
             build_records = True
             dictkeys.append(srckey)
-    if build_records
+    if build_records:
         for dictkey in dictkeys:
             for out_data_elem in rulesmap[dictkey]:
                 valid_data_elem = True
@@ -333,27 +421,47 @@ tuple[bool, list[str], tools.metrics.Metrics]:
                             fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                             tgtarray[tgtcolmap[output_col_data]] = fulldate
                         else:
-                            metrics.increment_key_count(
+                            metrics.increment_key_count(
+                                source=srcfilename,
+                                fieldname=srcfield,
+                                tablename=tgtfilename,
+                                concept_id="all",
+                                additional="",
+                                count_type="invalid_date_fields"
+                            )
                             valid_data_elem = False
                     elif output_col_data in date_col_data:
                         fulldate = srcdata[srccolmap[infield]]
                         tgtarray[tgtcolmap[output_col_data]] = fulldate
                         tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                if valid_data_elem
+                if valid_data_elem:
                     tgtrecords.append(tgtarray)
                 else:
-                    metrics.increment_key_count(
-
+                    metrics.increment_key_count(
+                        source=srcfilename,
+                        fieldname=srcfield,
+                        tablename=tgtfilename,
+                        concept_id="all",
+                        additional="",
+                        count_type="invalid_source_fields"
+                    )

     return build_records, tgtrecords, metrics

+
 def valid_value(item):
     """
     Check if an item is non blank (null)
     """
     if item.strip() == "":
-        return
-    return
+        return False
+    return True
+
+
+# DATE TESTING
+# ------------
+# I started by changing the get_datetime_value to be neater.
+# I think it should be handled all as one thing, but I've spent too much time doing this already

 def valid_date_value(item):
     """
@@ -363,44 +471,33 @@ def valid_date_value(item):
     if item.strip() == "":
         return(False)
     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-
-        return
-    return
+        logger.warning("Bad date : {0}".format(item))
+        return False
+    return True
+

 def get_datetime_value(item):
     """
-    Check if a date item is non
-    or
+    Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+    or UK format (DD/MM/YYYY).
+    Returns a datetime object if successful, None otherwise.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    pass
-
-    if dt != None:
-        return(dt)
-
-    # Does the date parse as a UK old-style date?
-    try:
-        dt = datetime.datetime.strptime(item, "%d/%m/%Y")
-    except ValueError:
-        pass
-
-    if dt != None:
-        return(dt)
-
+    date_formats = [
+        "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
+        "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
+        "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
+    ]
+
+    for date_format in date_formats:
+        try:
+            return datetime.datetime.strptime(item, date_format)
+        except ValueError:
+            continue
+
+    # If we get here, none of the formats worked
     return None

+
 def parse_date(item):
     """
     Crude hand-coded check on date format
@@ -411,9 +508,8 @@ def parse_date(item):
     if len(datedata) != 3:
         return None
     if len(datedata[2]) == 4:
-        return("{
-    return
-
+        return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
+    return "-".join(datedata[:3])

 def valid_iso_date(item):
     """
@@ -422,9 +518,10 @@ def valid_iso_date(item):
     try:
         datetime.datetime.strptime(item, "%Y-%m-%d")
     except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

 def valid_reverse_iso_date(item):
     """
@@ -433,9 +530,10 @@ def valid_reverse_iso_date(item):
     try:
         datetime.datetime.strptime(item, "%d-%m-%Y")
     except ValueError:
-        return
+        return False
+
+    return True

-    return(True)

 def valid_uk_date(item):
     """
@@ -444,12 +542,15 @@ def valid_uk_date(item):
     try:
         datetime.datetime.strptime(item, "%d/%m/%Y")
     except ValueError:
-        return
+        return False

-    return
+    return True

-
-
+
+# End of date code
+
+def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter="\t")

     for last_ids_data in csvr:
@@ -458,8 +559,9 @@ def load_last_used_ids(last_used_ids_file, last_used_ids):
     fh.close()
     return last_used_ids

-
-
+
+def load_saved_person_ids(person_file: Path):
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter="\t")
     last_int = 1
     person_ids = {}
@@ -475,23 +577,28 @@ def load_saved_person_ids(person_file):
 def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
     person_ids, person_number = get_person_lookup(saved_person_id_file)

-    fh = open(
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
     csvr = csv.reader(fh, delimiter=delim)
     person_columns = {}
     person_col_in_hdr_number = 0
     reject_count = 0

     personhdr = next(csvr)
-
+    logger.info(personhdr)

     # Make a dictionary of column names vs their positions
     for col in personhdr:
         person_columns[col] = person_col_in_hdr_number
         person_col_in_hdr_number += 1

-    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
-    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
-
+    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+        "person"
+    )
+    logger.info(
+        "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
+    )
+
     ## get the column index of the PersonID from the input file
     person_col = person_columns[person_id_source]

@@ -516,55 +623,122 @@ def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_p
 def py():
     pass

-
+
+def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
+    """Check if directory is valid, optionally create it if missing.
+
+    Args:
+        directory: Directory path as string or tuple
+        create_if_missing: If True, create directory if it doesn't exist
+    """
+
+    ## check directory has been set
+    if directory is None:
+        logger.warning("Directory not provided.")
+        sys.exit(1)
+
     ## check output dir is valid
-
+    elif type(directory) is tuple:
         directory = directory[0]

-    if not os.path.isdir(directory):
-        print("Not a directory, dir {0}".format(directory))
-        sys.exit(1)

-
-
+    ## if not a directory, create it if requested (including parents. This option is for the output directory only).
+    if not directory.is_dir():
+        if create_if_missing:
+            try:
+                ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+                directory.mkdir(parents = True)
+                logger.info(f"Created directory: {directory}")
+            except OSError as e:
+                logger.warning(f"Failed to create directory {directory}: {e}")
+                sys.exit(1)
+        else:
+            logger.warning(f"Not a directory, dir {directory}")
+            sys.exit(1)
+
+    # Handle tuple input (like input_dir)
+    if isinstance(directory, tuple):
+        if not directory:  # Empty tuple
+            print("No directory provided")
+            sys.exit(1)
+        directory = directory[0]
+
+    # Handle string input
+    dir_path = str(directory)
+    if not os.path.isdir(dir_path):
+        if create_if_missing:
+            try:
+                os.makedirs(dir_path)
+                print(f"Created directory: {dir_path}")
+            except OSError as e:
+                print(f"Failed to create directory {dir_path}: {e}")
+                sys.exit(1)
+        else:
+            print(f"Not a directory, dir {dir_path}")
+            sys.exit(1)
+
+
+def set_saved_person_id_file(
+    saved_person_id_file: Path | None, output_dir: Path
+) -> Path:
+    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
     if saved_person_id_file is None:
-        saved_person_id_file = output_dir
-        if
-
+        saved_person_id_file = output_dir / "person_ids.tsv"
+        if saved_person_id_file.exists():
+            assert not saved_person_id_file.is_dir()
+            saved_person_id_file.unlink()
+    else:
+        assert not saved_person_id_file.is_dir()
     return saved_person_id_file

-
 def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
     for infile in existing_input_files:
         if infile not in rules_input_files:
-            msg =
-
+            msg = (
+                "WARNING: no mapping rules found for existing input file - {0}".format(
+                    infile
+                )
+            )
+            logger.warning(msg)
     for infile in rules_input_files:
         if infile not in existing_input_files:
             msg = "WARNING: no data for mapped input file - {0}".format(infile)
-
+            logger.warning(msg)

-def open_file(
-
+def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+    """opens a file and does something related to CSVs"""
     try:
-        fh = open(
+        fh = file_path.open(mode="r", encoding="utf-8-sig")
         csvr = csv.reader(fh)
         return fh, csvr
     except IOError as e:
-
-
+        logger.exception("Unable to open: {0}".format(file_path))
+        logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
        return None

-
-
-
+
+def set_omop_filenames(
+    omop_ddl_file: Path, omop_config_file: Path, omop_version: str
+) -> tuple[Path, Path]:
+    if (
+        (omop_ddl_file is None)
+        and (omop_config_file is None)
+        and (omop_version is not None)
+    ):
+        omop_config_file = (
+            importlib.resources.files("carrottransform") / "config/omop.json"
+        )
         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-        omop_ddl_file =
+        omop_ddl_file = (
+            importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
+        )
         return omop_config_file, omop_ddl_file

-
+
+def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
     # Saved-person-file existence test, reload if found, return last used integer
-    if
+    if saved_person_id_file.is_file():
         person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
     else:
         person_lookup = {}
@@ -572,6 +746,3 @@ def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
     return person_lookup, last_used_integer

 run.add_command(mapstream,"mapstream")
-
-if __name__== '__main__':
-    mapstream()
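For orientation, here is a sketch of how the reworked mapstream command could be driven after this change, using click's test runner from Python. The file and directory names below are assumptions, not taken from this diff; the option names, the repeatable --input-dir, and the @carrot prefix (resolved against the installed package by resolve_paths) come from the code above.

# Illustrative only: the paths are made up; option names match the decorators in the diff above.
from click.testing import CliRunner
from carrottransform.cli.subcommands.run import mapstream

runner = CliRunner()
result = runner.invoke(mapstream, [
    "--rules-file", "rules.json",                                        # assumed local file
    "--person-file", "demo/person.csv",                                  # assumed local file
    "--output-dir", "out",                                               # created if missing (check_dir_isvalid)
    "--omop-ddl-file", "@carrot/config/OMOPCDM_postgresql_5.3_ddl.sql",  # @carrot -> installed package directory
    "--omop-config-file", "@carrot/config/omop.json",
    "--input-dir", "demo/inputs",                                        # multiple=True, may be repeated
])
print(result.exit_code)
print(result.output)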
carrottransform/tools/click.py

@@ -0,0 +1,21 @@
+import click
+from pathlib import Path
+
+
+def PathArgs():
+    """used by the click library for CLI args that are files"""
+
+    class PathArgs(click.ParamType):
+        name = "pathlib.Path"
+
+        def convert(self, value, param, ctx):
+            try:
+                return Path(value)
+            except Exception as e:
+                self.fail(f"Invalid path: {value} ({e})", param, ctx)
+
+    return PathArgs()
+
+
+# use this
+PathArgs = PathArgs()
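A small usage sketch of the new PathArgs parameter type; the command and option below are hypothetical, only PathArgs itself comes from the package. click calls PathArgs.convert() on the raw string, so the handler receives a pathlib.Path.

# Hypothetical command for illustration.
import click
from pathlib import Path
from carrottransform.tools.click import PathArgs

@click.command()
@click.option("--config", type=PathArgs, required=True)
def show(config: Path):
    # the option arrives already converted to a pathlib.Path by PathArgs.convert()
    click.echo(f"{type(config).__name__}: {config}")

if __name__ == "__main__":
    show()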
carrottransform/tools/file_helpers.py

@@ -1,15 +1,41 @@
+import json
+import logging
 import os
 import sys
 import json
+import importlib.resources as resources
+from typing import List, Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+

 # Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
-
-
+
+
+def load_json(f_in: Path):
     try:
-        data = json.load(open(
+        data = json.load(f_in.open())
     except Exception as err:
-
+        logger.exception("{0} not found. Or cannot parse as json".format(f_in))
         sys.exit()

     return data

+
+def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
+    """Resolve special path syntaxes in command line arguments."""
+    try:
+        with resources.files('carrottransform').joinpath('__init__.py') as f:
+            package_path = f.parent
+    except Exception:
+        # Fallback for development environment
+        import carrottransform
+        package_path = Path(carrottransform.__file__).resolve().parent
+
+    # Handle None values and replace @carrot with the actual package path
+    prefix = '@carrot'
+    return [
+        package_path / Path(str(arg).replace(prefix, '').lstrip('/')) if arg is not None and str(arg).startswith(prefix) else arg
+        for arg in args
+    ]
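A minimal sketch of what the new resolve_paths() helper does with the @carrot prefix; the exact resolved location depends on where carrottransform is installed, and the other paths here are made-up examples.

from pathlib import Path
from carrottransform.tools.file_helpers import resolve_paths

args = [Path("@carrot/config/omop.json"), Path("local/rules.json"), None]
resolved = resolve_paths(args)
# resolved[0] -> <carrottransform package dir>/config/omop.json
# resolved[1] -> Path("local/rules.json") (unchanged); resolved[2] -> None (preserved)
print(resolved)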
carrottransform/tools/mappingrules.py

@@ -3,13 +3,16 @@ import json
 import carrottransform.tools as tools
 from .omopcdm import OmopCDM

+import logging
+logger = logging.getLogger(__name__)
+
 class MappingRules:
     """
     self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
     as a file-specific dictionary allowing rules to be "looked-up" depending on data content
     """

-    def __init__(self, rulesfilepath, omopcdm):
+    def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
         ## just loads the json directly
         self.rules_data = tools.load_json(rulesfilepath)
         self.omopcdm = omopcdm
@@ -80,7 +83,7 @@ class MappingRules:
             outfile = keydata[-1]
             for outfield_elem in outfield_data:
                 for infield, outfield_list in outfield_elem.items():
-
+                    logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
                     for outfield in outfield_list:
                         if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
                             datetime_source = infield
carrottransform/tools/metrics.py
CHANGED

@@ -1,3 +1,95 @@
+
+import logging
+logger = logging.getLogger(__name__)
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname:str
+    tablename:str
+    concept_id:str
+    additional:str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is here in case that representation is needed somewhere
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts
+        """
+        return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int=0):
+        return self.counts.get(count_type, default)
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [str(col) for col in [
+            self.dataset_name,
+            self.source,
+            self.fieldname,
+            self.tablename,
+            self.concept_id,
+            self.additional,
+            self.input_count,
+            self.invalid_person_ids,
+            self.invalid_date_fields,
+            self.invalid_source_fields,
+            self.output_count
+        ]]
+        # If python gets updated, you can move the row_str expression into the f-string
+        row_str = '\t'.join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount"
+        ]
+        header_str = '\t'.join(header)
+        return f"{header_str}\n"
+
 class Metrics():
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
@@ -58,21 +150,87 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])

-    def increment_key_count(self,
-
-
-        """
+    def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] =
-
-
-
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        #Are the parameters for DataKeys hierarchical?
+        #If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        #But I don't know if that's the desired behaviour
+
+        #A lot of these increment the same thing, so I have defined `increment_this`
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional = "",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count"
+            )
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+

     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"

         for dkey in self.datasummary:
-
+            logger.debug(dkey)
             source, tablename, name, colname = dkey.split('.')
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
@@ -90,40 +248,54 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary

-    def
-
-
-
-
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
-
-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []

-
-
-
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]

-
-
-
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count")
+            )

-
-
-
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows

-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
+
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()
+
+        for row in summary_rows:
+            result += row.to_tsv_row()
+
+        return result

-
-
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows]
+        }
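A short sketch of the structured key/count types added above; the values are made up, but the behaviour shown (the legacy '~'-joined string form, per-count-type tallies, and the TSV header) is taken from the code in this diff.

from carrottransform.tools.metrics import DataKey, CountData, MapstreamSummaryRow

key = DataKey(source="demo.csv", fieldname="all", tablename="person", concept_id="all", additional="")
counts = CountData()
counts.increment("input_count")
counts.increment("input_count")
counts.increment("output_count")

print(str(key))                          # demo.csv~all~person~all~
print(counts.get_count("input_count"))   # 2
print(MapstreamSummaryRow.get_header(), end="")  # dsname, source, ..., outcount (tab-separated)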
carrottransform/tools/omopcdm.py
CHANGED

@@ -1,8 +1,13 @@
 import carrottransform.tools as tools
 import json
+import logging
 import re
 import sys

+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
@@ -29,11 +34,11 @@ class OmopCDM:
         self.auto_number_field = self.get_columns("auto_number_field")


-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(
+            fp = omopddl.open("r")
         except Exception as err:
-
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()

         return(self.process_ddl(fp))
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/LICENSE — file without changes
{carrot_transform-0.3.4.dist-info → carrot_transform-0.3.5.dist-info}/entry_points.txt — file without changes