carrot-transform 0.3.tar.gz → 0.3.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of carrot-transform was flagged as potentially problematic.

Files changed (28)
  1. {carrot_transform-0.3 → carrot_transform-0.3.2}/PKG-INFO +2 -2
  2. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/PKG-INFO +2 -2
  3. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.py +1 -0
  4. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/_version.py +1 -1
  5. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/run.py +37 -25
  6. carrot_transform-0.3.2/carrottransform/tools/file_helpers.py +15 -0
  7. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/mappingrules.py +4 -0
  8. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/metrics.py +10 -8
  9. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/omopcdm.py +7 -2
  10. {carrot_transform-0.3 → carrot_transform-0.3.2}/pyproject.toml +2 -2
  11. carrot_transform-0.3/carrottransform/tools/file_helpers.py +0 -14
  12. {carrot_transform-0.3 → carrot_transform-0.3.2}/.github/workflows/pypi.publish.yml +0 -0
  13. {carrot_transform-0.3 → carrot_transform-0.3.2}/.gitignore +0 -0
  14. {carrot_transform-0.3 → carrot_transform-0.3.2}/LICENSE +0 -0
  15. {carrot_transform-0.3 → carrot_transform-0.3.2}/MANIFEST.in +0 -0
  16. {carrot_transform-0.3 → carrot_transform-0.3.2}/README.md +0 -0
  17. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/SOURCES.txt +0 -0
  18. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/dependency_links.txt +0 -0
  19. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/entry_points.txt +0 -0
  20. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/top_level.txt +0 -0
  21. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/__init__.py +0 -0
  22. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/__init__.py +0 -0
  23. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/command.py +0 -0
  24. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/__init__.py +0 -0
  25. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
  26. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/omop.json +0 -0
  27. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/__init__.py +0 -0
  28. {carrot_transform-0.3 → carrot_transform-0.3.2}/setup.cfg +0 -0
PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
carrot_transform.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
carrot_transform.py
@@ -1,4 +1,5 @@
 # Provides an entry point for the built executable
+# Build with "pyinstaller --onefile carrot_transform.py"
 from carrottransform.cli.command import transform
 if __name__ == '__main__':
     transform()
carrottransform/_version.py
@@ -1,2 +1,2 @@
 # TODO - pick this up automatically when building
-__version__ = '0.3'
+__version__ = '0.3.2'
carrottransform/cli/subcommands/run.py
@@ -27,8 +27,14 @@ def run():
 @click.option("--person-file",
               required=True,
               help="File containing person_ids in the first column")
+@click.option("--omop-ddl-file",
+              required=False,
+              help="File containing OHDSI ddl statements for OMOP tables")
+@click.option("--omop-config-file",
+              required=False,
+              help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
-              required=True,
+              required=False,
               help="Quoted string containing opmop version - eg '5.3'")
 @click.option("--saved-person-id-file",
               default=None,
@@ -49,13 +55,23 @@ def run():
 @click.argument("input-dir",
                 required=False,
                 nargs=-1)
-def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, saved_person_id_file, use_input_person_ids, last_used_ids_file, log_file_threshold, input_dir):
+def mapstream(rules_file, output_dir, write_mode,
+              person_file, omop_ddl_file, omop_config_file,
+              omop_version, saved_person_id_file, use_input_person_ids,
+              last_used_ids_file, log_file_threshold, input_dir):
     """
     Map to output using input streams
     """
-    omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
-    omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-    omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+    # Initialisation
+    # - check for values in optional arguments
+    # - read in configuration files
+    # - check main directories for existence
+    # - handle saved persion ids
+    # - initialise metrics
+    if (omop_ddl_file == None) and (omop_config_file == None) and (omop_version != None):
+        omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+        omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
 
     if os.path.isdir(input_dir[0]) == False:
         print("Not a directory, input dir {0}".format(input_dir[0]))
@@ -72,13 +88,12 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     starttime = time.time()
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
-    #print(omopcdm.dump_ddl())
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
     nowtime = time.time()
 
     print("--------------------------------------------------------------------------------")
-    print("Loaded mapping rules from: {0} after {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+    print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
     output_files = mappingrules.get_all_outfile_names()
     record_numbers = {}
     for output_file in output_files:
@@ -88,7 +103,7 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     tgtcolmaps = {}
 
     try:
-        # Add in a saved-person-file existence test and reload from it is necessary returning the last used integer
+        # Saved-person-file existence test, reload if found, return last used integer
         if os.path.isfile(saved_person_id_file):
            person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
         else:
@@ -98,14 +113,13 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         if os.path.isfile(last_used_ids_file):
             record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
 
-        #fhp = open(person_file, mode="r", encoding="utf-8-sig")
-        #csvrp = csv.reader(fhp)
         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
         fhpout = open(saved_person_id_file, mode="w")
         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
         for person_id, person_assigned_id in person_lookup.items():
             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
         fhpout.close()
+        # Initialise output files, output a header for each
        for tgtfile in output_files:
            fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
            if write_mode == 'w':
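For reference, the saved-person-id file written above is a two-column tab-separated mapping of source ids to assigned integer ids; the values below are invented for illustration:

    SOURCE_SUBJECT    TARGET_SUBJECT
    P0001             1
    P0002             2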
@@ -119,28 +133,30 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
 
-    # TODO get this list of input files from the parsed rules
+    # Compare files found in the input_dir with those expected based on mapping rules
     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
     rules_input_files = mappingrules.get_all_infile_names()
+    # Log mismatches but continue
     for infile in existing_input_files:
         if infile not in rules_input_files:
             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
             print(msg)
-            metrics.add_log_data(msg)
     for infile in rules_input_files:
         if infile not in existing_input_files:
             msg = "ERROR: no data for mapped input file - {0}".format(infile)
             print(msg)
-            metrics.add_log_data(msg)
+
+    # set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-    #src_tgt_counts = {}
     print(rules_input_files)
 
+    # set up per-input counts
     for srcfilename in rules_input_files:
         rejidcounts[srcfilename] = 0
         rejdatecounts[srcfilename] = 0
 
+    # main processing loop, for each input file
     for srcfilename in rules_input_files:
         outcounts = {}
         rejcounts = {}
@@ -169,17 +185,15 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
             datetime_col = inputcolmap[infile_datetime_source]
             print("--------------------------------------------------------------------------------")
             print("Processing input: {0}".format(srcfilename))
-            # print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
-
+
+            # for each input record
             for indata in csvr:
-                #indata = inputline.strip().split(",")
                 key = srcfilename + "~all~all~all~"
                 metrics.increment_key_count(key, "input_count")
                 rcount += 1
                 strdate = indata[datetime_col].split(" ")[0]
                 fulldate = parse_date(strdate)
                 if fulldate != None:
-                    #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                     indata[datetime_col] = fulldate
                 else:
                     metrics.increment_key_count(key, "invalid_date_fields")
@@ -236,28 +250,26 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
         for outtablename, count in outcounts.items():
             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+    # END main processing loop
 
     print("--------------------------------------------------------------------------------")
     data_summary = metrics.get_mapstream_summary()
-    log_report = metrics.get_log_data()
     try:
         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
         dsfh.write(data_summary)
         dsfh.close()
-        logfh = open(output_dir + "/error_report.txt", mode="w")
-        logfh.write(log_report)
-        logfh.close()
     except IOError as e:
         print("I/O error({0}): {1}".format(e.errno, e.strerror))
         print("Unable to write file")
 
+    # END mapstream
     nowtime = time.time()
     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
-    #profiler.disable()
-    #stats = pstats.Stats(profiler).sort_stats('ncalls')
-    #stats.print_stats()
 
 def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    """
+    build all target records for a given input field
+    """
     build_records = False
     tgtrecords = []
     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
carrottransform/tools/file_helpers.py (new file)
@@ -0,0 +1,15 @@
+import os
+import sys
+import json
+
+# Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
+
+def load_json(f_in):
+    try:
+        data = json.load(open(f_in))
+    except Exception as err:
+        print ("{0} not found. Or cannot parse as json".format(f_in))
+        sys.exit()
+
+    return data
+
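This replaces the old helper (deleted at the bottom of this diff) that fell back to parsing its argument as a json string; the new version simply exits on error. A hedged usage sketch (rules.json is an invented path; MappingRules reaches this via tools.load_json):

    from carrottransform.tools.file_helpers import load_json

    # Prints a message and calls sys.exit() if the file is missing or not valid json
    rules = load_json("rules.json")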
carrottransform/tools/mappingrules.py
@@ -4,6 +4,10 @@ import carrottransform.tools as tools
 from .omopcdm import OmopCDM
 
 class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
+    as a file-specific dictionary allowing rules to be "looked-up" depending on data content
+    """
 
     def __init__(self, rulesfilepath, omopcdm):
         self.rules_data = tools.load_json(rulesfilepath)
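The accessors called from run.py show how the reorganised rules are consumed; a sketch assuming a valid rules file and an already-built OmopCDM instance:

    mappingrules = MappingRules(rules_file, omopcdm)
    dataset = mappingrules.get_dataset_name()
    infiles = mappingrules.get_all_infile_names()    # expected input csv names
    outfiles = mappingrules.get_all_outfile_names()  # OMOP target table names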
carrottransform/tools/metrics.py
@@ -1,12 +1,21 @@
 class Metrics():
+    """
+    Capture metrics for output to a summary tsv file, record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
     def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
         self.datasummary={}
         self.allcounts={}
-        self.log_data=""
         self.dataset_name=dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):
+        """
+        return a new, initialised, count structure
+        """
         counts = {}
         counts["input_count"] = 0
         counts["invalid_persids"] = 0
@@ -118,10 +127,3 @@ class Metrics():
             summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
 
         return summary_str
-
-    def add_log_data(self, msg):
-        self.log_data += msg + "\n"
-
-    def get_log_data(self):
-        return self.log_data
-
carrottransform/tools/omopcdm.py
@@ -4,6 +4,11 @@ import re
 import sys
 
 class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
 
     def __init__(self, omopddl, omopcfg):
         self.numeric_types = ["integer", "numeric"]
@@ -24,8 +29,8 @@ class OmopCDM:
     def load_ddl(self, omopddl):
         try:
             fp = open(omopddl, "r")
-        except IOError as e:
-            print("I/O error for ddl file ({0}): {1}".format(e.errno, e.strerror))
+        except Exception as err:
+            print("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
 
         return(self.process_ddl(fp))
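Construction mirrors the call in run.py (omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)); a sketch using the bundled 5.3 ddl, where the paths assume the package config directory and "person" is just an example table:

    omopcdm = OmopCDM("config/OMOPCDM_postgresql_5.3_ddl.sql", "config/omop.json")
    # get_omop_datetime_linked_fields is the accessor used by get_target_records
    date_fields = omopcdm.get_omop_datetime_linked_fields("person")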
pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "carrot-transform"
-version = "0.3"
+version = "0.3.2"
 authors = [
   { name="PD Appleby", email="pdappleby@gmail.com" },
 ]
-description = "Carrot somple transformer, input rules and data csvs, output OMOP"
+description = "Carrot simple transformer, input rules and data csv's, output OMOP"
 readme = "README.md"
 requires-python = ">=3.9"
 classifiers = [
carrot_transform-0.3/carrottransform/tools/file_helpers.py (deleted)
@@ -1,14 +0,0 @@
-import os
-import json
-
-def load_json(f_in):
-    if os.path.exists(f_in):
-        data = json.load(open(f_in))
-    else:
-        try:
-            data = json.loads(f_in)
-        except Exception as err:
-            raise FileNotFoundError(f"{f_in} not found. Or cannot parse as json")
-
-    return data
-