PyPI - carrot-transform - Versions diffs - 0.3__tar.gz → 0.3.1__tar.gz - Mend

carrot-transform 0.3.tar.gz → 0.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of carrot-transform might be problematic. Click here for more details.

Files changed (27) hide show

{carrot_transform-0.3 → carrot_transform-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.1
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.1
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/_version.py RENAMED Viewed

@@ -1,2 +1,2 @@
 # TODO - pick this up automatically when building
-__version__ = '0.3'
+__version__ = '0.3.1'

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/cli/subcommands/run.py RENAMED Viewed

@@ -53,6 +53,12 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     """
     Map to output using input streams
     """
+    # Initialisation
+    # - check for values in optional arguments
+    # - read in configuration files
+    # - check main directories for existence
+    # - handle saved person ids
+    # - initialise metrics
     omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
     omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
     omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
@@ -88,7 +94,7 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     tgtcolmaps = {}
     try:
-        # Add in a saved-person-file existence test and reload from it is necessary returning the last used integer
+        # Saved-person-file existence test, reload if found, return last used integer
         if os.path.isfile(saved_person_id_file):
             person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
         else:
@@ -98,14 +104,13 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
             if os.path.isfile(last_used_ids_file):
                 record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
-        #fhp = open(person_file, mode="r", encoding="utf-8-sig")
-        #csvrp = csv.reader(fhp)
         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
         fhpout = open(saved_person_id_file, mode="w")
         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
         for person_id, person_assigned_id in person_lookup.items():
             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
         fhpout.close()
+        # Initialise output files, output a header for each
         for tgtfile in output_files:
             fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
             if write_mode == 'w':
@@ -119,9 +124,10 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
-    # TODO get this list of input files from the  parsed rules
+    # Compare files found in the input_dir with those expected based on mapping rules
     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
     rules_input_files = mappingrules.get_all_infile_names()
+    # Log mismatches but continue
     for infile in existing_input_files:
         if infile not in rules_input_files:
             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
@@ -132,15 +138,18 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
             msg = "ERROR: no data for mapped input file - {0}".format(infile)
             print(msg)
             metrics.add_log_data(msg)
+    # set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-    #src_tgt_counts = {}
     print(rules_input_files)
+    # set up per-input counts
     for srcfilename in rules_input_files:
         rejidcounts[srcfilename] = 0
         rejdatecounts[srcfilename] = 0
+    # main processing loop, for each input file
     for srcfilename in rules_input_files:
         outcounts = {}
         rejcounts = {}
@@ -169,17 +178,15 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         datetime_col = inputcolmap[infile_datetime_source]
         print("--------------------------------------------------------------------------------")
         print("Processing input: {0}".format(srcfilename))
-#        print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
+        # for each input record
         for indata in csvr:
-            #indata = inputline.strip().split(",")
             key = srcfilename + "~all~all~all~"
             metrics.increment_key_count(key, "input_count")
             rcount += 1
             strdate = indata[datetime_col].split(" ")[0]
             fulldate = parse_date(strdate)
             if fulldate != None:
-                #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                 indata[datetime_col] = fulldate
             else:
                 metrics.increment_key_count(key, "invalid_date_fields")
@@ -258,6 +265,9 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     #stats.print_stats()
 def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    """
+    build all target records for a given input field
+    """
     build_records = False
     tgtrecords = []
     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/file_helpers.py RENAMED Viewed

@@ -1,5 +1,7 @@
 import os
 import json
+# Function inherited from the "old" CaRROT-CDM
 def load_json(f_in):
   if os.path.exists(f_in):

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/mappingrules.py RENAMED Viewed

@@ -4,6 +4,10 @@ import carrottransform.tools as tools
 from .omopcdm import OmopCDM
 class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed json; as each input file is processed, rules are reorganised
+    as a file-specific dictionary allowing rules to be "looked-up" depending on data content
+    """
     def __init__(self, rulesfilepath, omopcdm):
         self.rules_data = tools.load_json(rulesfilepath)

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/metrics.py RENAMED Viewed

@@ -1,5 +1,12 @@
 class Metrics():
+    """
+    Capture metrics for output to a summary tsv file, record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
     def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
         self.datasummary={}
         self.allcounts={}
         self.log_data=""
@@ -7,6 +14,9 @@ class Metrics():
         self.log_threshold = log_threshold
     def get_new_mapstream_counts(self):
+        """
+        return a new, initialised,  count structure
+        """
         counts = {}
         counts["input_count"] = 0
         counts["invalid_persids"] = 0

{carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/omopcdm.py RENAMED Viewed

@@ -4,6 +4,11 @@ import re
 import sys
 class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
     def __init__(self, omopddl, omopcfg):
         self.numeric_types = ["integer", "numeric"]

{carrot_transform-0.3 → carrot_transform-0.3.1}/pyproject.toml RENAMED Viewed

@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "carrot-transform"
-version = "0.3"
+version = "0.3.1"
 authors = [
   { name="PD Appleby", email="pdappleby@gmail.com" },
 ]
-description = "Carrot somple transformer, input rules and data csvs, output OMOP"
+description = "Carrot simple transformer, input rules and data csv's, output OMOP"
 readme = "README.md"
 requires-python = ">=3.9"
 classifiers = [