carrot-transform 0.3__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

Files changed (27) hide show
  1. {carrot_transform-0.3 → carrot_transform-0.3.1}/PKG-INFO +2 -2
  2. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/PKG-INFO +2 -2
  3. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/_version.py +1 -1
  4. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/cli/subcommands/run.py +19 -9
  5. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/file_helpers.py +2 -0
  6. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/mappingrules.py +4 -0
  7. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/metrics.py +10 -0
  8. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/omopcdm.py +5 -0
  9. {carrot_transform-0.3 → carrot_transform-0.3.1}/pyproject.toml +2 -2
  10. {carrot_transform-0.3 → carrot_transform-0.3.1}/.github/workflows/pypi.publish.yml +0 -0
  11. {carrot_transform-0.3 → carrot_transform-0.3.1}/.gitignore +0 -0
  12. {carrot_transform-0.3 → carrot_transform-0.3.1}/LICENSE +0 -0
  13. {carrot_transform-0.3 → carrot_transform-0.3.1}/MANIFEST.in +0 -0
  14. {carrot_transform-0.3 → carrot_transform-0.3.1}/README.md +0 -0
  15. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/SOURCES.txt +0 -0
  16. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/dependency_links.txt +0 -0
  17. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/entry_points.txt +0 -0
  18. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.egg-info/top_level.txt +0 -0
  19. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrot_transform.py +0 -0
  20. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/__init__.py +0 -0
  21. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/cli/__init__.py +0 -0
  22. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/cli/command.py +0 -0
  23. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/cli/subcommands/__init__.py +0 -0
  24. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
  25. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/config/omop.json +0 -0
  26. {carrot_transform-0.3 → carrot_transform-0.3.1}/carrottransform/tools/__init__.py +0 -0
  27. {carrot_transform-0.3 → carrot_transform-0.3.1}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: carrot-transform
3
- Version: 0.3
4
- Summary: Carrot somple transformer, input rules and data csvs, output OMOP
3
+ Version: 0.3.1
4
+ Summary: Carrot simple transformer, input rules and data csv's, output OMOP
5
5
  Author-email: PD Appleby <pdappleby@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
7
7
  Classifier: License :: OSI Approved :: MIT License
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: carrot-transform
3
- Version: 0.3
4
- Summary: Carrot somple transformer, input rules and data csvs, output OMOP
3
+ Version: 0.3.1
4
+ Summary: Carrot simple transformer, input rules and data csv's, output OMOP
5
5
  Author-email: PD Appleby <pdappleby@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
7
7
  Classifier: License :: OSI Approved :: MIT License
@@ -1,2 +1,2 @@
1
1
  # TODO - pick this up automatically when building
2
- __version__ = '0.3'
2
+ __version__ = '0.3.1'
@@ -53,6 +53,12 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
53
53
  """
54
54
  Map to output using input streams
55
55
  """
56
+ # Initialisation
57
+ # - check for values in optional arguments
58
+ # - read in configuration files
59
+ # - check main directories for existence
60
+ # - handle saved person ids
61
+ # - initialise metrics
56
62
  omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
57
63
  omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
58
64
  omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
@@ -88,7 +94,7 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
88
94
  tgtcolmaps = {}
89
95
 
90
96
  try:
91
- # Add in a saved-person-file existence test and reload from it is necessary returning the last used integer
97
+ # Saved-person-file existence test, reload if found, return last used integer
92
98
  if os.path.isfile(saved_person_id_file):
93
99
  person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
94
100
  else:
@@ -98,14 +104,13 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
98
104
  if os.path.isfile(last_used_ids_file):
99
105
  record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
100
106
 
101
- #fhp = open(person_file, mode="r", encoding="utf-8-sig")
102
- #csvrp = csv.reader(fhp)
103
107
  person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
104
108
  fhpout = open(saved_person_id_file, mode="w")
105
109
  fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
106
110
  for person_id, person_assigned_id in person_lookup.items():
107
111
  fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
108
112
  fhpout.close()
113
+ # Initialise output files, output a header for each
109
114
  for tgtfile in output_files:
110
115
  fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
111
116
  if write_mode == 'w':
@@ -119,9 +124,10 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
119
124
 
120
125
  print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
121
126
 
122
- # TODO get this list of input files from the parsed rules
127
+ # Compare files found in the input_dir with those expected based on mapping rules
123
128
  existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
124
129
  rules_input_files = mappingrules.get_all_infile_names()
130
+ # Log mismatches but continue
125
131
  for infile in existing_input_files:
126
132
  if infile not in rules_input_files:
127
133
  msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
@@ -132,15 +138,18 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
132
138
  msg = "ERROR: no data for mapped input file - {0}".format(infile)
133
139
  print(msg)
134
140
  metrics.add_log_data(msg)
141
+
142
+ # set up overall counts
135
143
  rejidcounts = {}
136
144
  rejdatecounts = {}
137
- #src_tgt_counts = {}
138
145
  print(rules_input_files)
139
146
 
147
+ # set up per-input counts
140
148
  for srcfilename in rules_input_files:
141
149
  rejidcounts[srcfilename] = 0
142
150
  rejdatecounts[srcfilename] = 0
143
151
 
152
+ # main processing loop, for each input file
144
153
  for srcfilename in rules_input_files:
145
154
  outcounts = {}
146
155
  rejcounts = {}
@@ -169,17 +178,15 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
169
178
  datetime_col = inputcolmap[infile_datetime_source]
170
179
  print("--------------------------------------------------------------------------------")
171
180
  print("Processing input: {0}".format(srcfilename))
172
- # print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
173
-
181
+
182
+ # for each input record
174
183
  for indata in csvr:
175
- #indata = inputline.strip().split(",")
176
184
  key = srcfilename + "~all~all~all~"
177
185
  metrics.increment_key_count(key, "input_count")
178
186
  rcount += 1
179
187
  strdate = indata[datetime_col].split(" ")[0]
180
188
  fulldate = parse_date(strdate)
181
189
  if fulldate != None:
182
- #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
183
190
  indata[datetime_col] = fulldate
184
191
  else:
185
192
  metrics.increment_key_count(key, "invalid_date_fields")
@@ -258,6 +265,9 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
258
265
  #stats.print_stats()
259
266
 
260
267
  def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
268
+ """
269
+ build all target records for a given input field
270
+ """
261
271
  build_records = False
262
272
  tgtrecords = []
263
273
  date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
@@ -1,5 +1,7 @@
1
1
  import os
2
2
  import json
3
+
4
+ # Function inherited from the "old" CaRROT-CDM
3
5
 
4
6
  def load_json(f_in):
5
7
  if os.path.exists(f_in):
@@ -4,6 +4,10 @@ import carrottransform.tools as tools
4
4
  from .omopcdm import OmopCDM
5
5
 
6
6
  class MappingRules:
7
+ """
8
+ self.rules_data stores the mapping rules as untransformed json; as each input file is processed, rules are reorganised
9
+ as a file-specific dictionary allowing rules to be "looked-up" depending on data content
10
+ """
7
11
 
8
12
  def __init__(self, rulesfilepath, omopcdm):
9
13
  self.rules_data = tools.load_json(rulesfilepath)
@@ -1,5 +1,12 @@
1
1
  class Metrics():
2
+ """
3
+ Capture metrics for output to a summary tsv file, record counts at multiple levels
4
+ The main principle is to increment counts associated with datakeys (dkey) at different levels
5
+ """
2
6
  def __init__(self, dataset_name, log_threshold=0):
7
+ """
8
+ self.datasummary holds all the saved counts
9
+ """
3
10
  self.datasummary={}
4
11
  self.allcounts={}
5
12
  self.log_data=""
@@ -7,6 +14,9 @@ class Metrics():
7
14
  self.log_threshold = log_threshold
8
15
 
9
16
  def get_new_mapstream_counts(self):
17
+ """
18
+ return a new, initialised, count structure
19
+ """
10
20
  counts = {}
11
21
  counts["input_count"] = 0
12
22
  counts["invalid_persids"] = 0
@@ -4,6 +4,11 @@ import re
4
4
  import sys
5
5
 
6
6
  class OmopCDM:
7
+ """
8
+ Load and parse OMOP DDL data, to make an in-memory json CDM
9
+ Merge in manual additions (currently necessary to identify person id, date / time fields and autonumber fields)
10
+ Define a series of "get" functions to allow CDM component discovery
11
+ """
7
12
 
8
13
  def __init__(self, omopddl, omopcfg):
9
14
  self.numeric_types = ["integer", "numeric"]
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "carrot-transform"
7
- version = "0.3"
7
+ version = "0.3.1"
8
8
  authors = [
9
9
  { name="PD Appleby", email="pdappleby@gmail.com" },
10
10
  ]
11
- description = "Carrot somple transformer, input rules and data csvs, output OMOP"
11
+ description = "Carrot simple transformer, input rules and data csv's, output OMOP"
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.9"
14
14
  classifiers = [
File without changes