carrot-transform 0.3.tar.gz → 0.3.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of carrot-transform was flagged as potentially problematic.

Files changed (28)
  1. {carrot_transform-0.3 → carrot_transform-0.3.2}/PKG-INFO +2 -2
  2. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/PKG-INFO +2 -2
  3. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.py +1 -0
  4. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/_version.py +1 -1
  5. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/run.py +37 -25
  6. carrot_transform-0.3.2/carrottransform/tools/file_helpers.py +15 -0
  7. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/mappingrules.py +4 -0
  8. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/metrics.py +10 -8
  9. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/omopcdm.py +7 -2
  10. {carrot_transform-0.3 → carrot_transform-0.3.2}/pyproject.toml +2 -2
  11. carrot_transform-0.3/carrottransform/tools/file_helpers.py +0 -14
  12. {carrot_transform-0.3 → carrot_transform-0.3.2}/.github/workflows/pypi.publish.yml +0 -0
  13. {carrot_transform-0.3 → carrot_transform-0.3.2}/.gitignore +0 -0
  14. {carrot_transform-0.3 → carrot_transform-0.3.2}/LICENSE +0 -0
  15. {carrot_transform-0.3 → carrot_transform-0.3.2}/MANIFEST.in +0 -0
  16. {carrot_transform-0.3 → carrot_transform-0.3.2}/README.md +0 -0
  17. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/SOURCES.txt +0 -0
  18. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/dependency_links.txt +0 -0
  19. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/entry_points.txt +0 -0
  20. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrot_transform.egg-info/top_level.txt +0 -0
  21. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/__init__.py +0 -0
  22. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/__init__.py +0 -0
  23. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/command.py +0 -0
  24. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/cli/subcommands/__init__.py +0 -0
  25. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
  26. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/config/omop.json +0 -0
  27. {carrot_transform-0.3 → carrot_transform-0.3.2}/carrottransform/tools/__init__.py +0 -0
  28. {carrot_transform-0.3 → carrot_transform-0.3.2}/setup.cfg +0 -0
PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
carrot_transform.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: carrot-transform
-Version: 0.3
-Summary: Carrot somple transformer, input rules and data csvs, output OMOP
+Version: 0.3.2
+Summary: Carrot simple transformer, input rules and data csv's, output OMOP
 Author-email: PD Appleby <pdappleby@gmail.com>
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
carrot_transform.py
@@ -1,4 +1,5 @@
 # Provides an entry point for the built executable
+# Build with "pyinstaller --onefile carrot_transform.py"
 from carrottransform.cli.command import transform
 if __name__ == '__main__':
     transform()
carrottransform/_version.py
@@ -1,2 +1,2 @@
 # TODO - pick this up automatically when building
-__version__ = '0.3'
+__version__ = '0.3.2'
carrottransform/cli/subcommands/run.py
@@ -27,8 +27,14 @@ def run():
 @click.option("--person-file",
               required=True,
               help="File containing person_ids in the first column")
+@click.option("--omop-ddl-file",
+              required=False,
+              help="File containing OHDSI ddl statements for OMOP tables")
+@click.option("--omop-config-file",
+              required=False,
+              help="File containing additional / override json config for omop outputs")
 @click.option("--omop-version",
-              required=True,
+              required=False,
               help="Quoted string containing opmop version - eg '5.3'")
 @click.option("--saved-person-id-file",
               default=None,
@@ -49,13 +55,23 @@ def run():
 @click.argument("input-dir",
                 required=False,
                 nargs=-1)
-def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, saved_person_id_file, use_input_person_ids, last_used_ids_file, log_file_threshold, input_dir):
+def mapstream(rules_file, output_dir, write_mode,
+              person_file, omop_ddl_file, omop_config_file,
+              omop_version, saved_person_id_file, use_input_person_ids,
+              last_used_ids_file, log_file_threshold, input_dir):
     """
     Map to output using input streams
     """
-    omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
-    omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-    omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+    # Initialisation
+    # - check for values in optional arguments
+    # - read in configuration files
+    # - check main directories for existence
+    # - handle saved persion ids
+    # - initialise metrics
+    if (omop_ddl_file == None) and (omop_config_file == None) and (omop_version != None):
+        omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+        omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+        omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
 
     if os.path.isdir(input_dir[0]) == False:
         print("Not a directory, input dir {0}".format(input_dir[0]))
@@ -72,13 +88,12 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     starttime = time.time()
     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
-    #print(omopcdm.dump_ddl())
     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
     nowtime = time.time()
 
     print("--------------------------------------------------------------------------------")
-    print("Loaded mapping rules from: {0} after {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+    print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
     output_files = mappingrules.get_all_outfile_names()
     record_numbers = {}
     for output_file in output_files:
@@ -88,7 +103,7 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
     tgtcolmaps = {}
 
     try:
-        # Add in a saved-person-file existence test and reload from it is necessary returning the last used integer
+        # Saved-person-file existence test, reload if found, return last used integer
         if os.path.isfile(saved_person_id_file):
            person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
         else:
@@ -98,14 +113,13 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         if os.path.isfile(last_used_ids_file):
             record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
 
-        #fhp = open(person_file, mode="r", encoding="utf-8-sig")
-        #csvrp = csv.reader(fhp)
         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
         fhpout = open(saved_person_id_file, mode="w")
         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
         for person_id, person_assigned_id in person_lookup.items():
             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
         fhpout.close()
+        # Initialise output files, output a header for each
        for tgtfile in output_files:
            fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
            if write_mode == 'w':
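For reference, the saved-person-id file written above is a two-column tab-separated mapping of source ids to assigned integer ids; the values below are invented for illustration:

    SOURCE_SUBJECT    TARGET_SUBJECT
    P0001             1
    P0002             2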
@@ -119,28 +133,30 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
 
     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
 
-    # TODO get this list of input files from the parsed rules
+    # Compare files found in the input_dir with those expected based on mapping rules
     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
     rules_input_files = mappingrules.get_all_infile_names()
+    # Log mismatches but continue
     for infile in existing_input_files:
         if infile not in rules_input_files:
             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
             print(msg)
-            metrics.add_log_data(msg)
     for infile in rules_input_files:
         if infile not in existing_input_files:
             msg = "ERROR: no data for mapped input file - {0}".format(infile)
             print(msg)
-            metrics.add_log_data(msg)
+
+    # set up overall counts
     rejidcounts = {}
     rejdatecounts = {}
-    #src_tgt_counts = {}
     print(rules_input_files)
 
+    # set up per-input counts
     for srcfilename in rules_input_files:
         rejidcounts[srcfilename] = 0
         rejdatecounts[srcfilename] = 0
 
+    # main processing loop, for each input file
     for srcfilename in rules_input_files:
         outcounts = {}
         rejcounts = {}
@@ -169,17 +185,15 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
             datetime_col = inputcolmap[infile_datetime_source]
             print("--------------------------------------------------------------------------------")
             print("Processing input: {0}".format(srcfilename))
-            # print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
-
+
+            # for each input record
             for indata in csvr:
-                #indata = inputline.strip().split(",")
                 key = srcfilename + "~all~all~all~"
                 metrics.increment_key_count(key, "input_count")
                 rcount += 1
                 strdate = indata[datetime_col].split(" ")[0]
                 fulldate = parse_date(strdate)
                 if fulldate != None:
-                    #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
                     indata[datetime_col] = fulldate
                 else:
                     metrics.increment_key_count(key, "invalid_date_fields")
@@ -236,28 +250,26 @@ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, sav
         print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
         for outtablename, count in outcounts.items():
             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+    # END main processing loop
 
     print("--------------------------------------------------------------------------------")
     data_summary = metrics.get_mapstream_summary()
-    log_report = metrics.get_log_data()
     try:
         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
         dsfh.write(data_summary)
         dsfh.close()
-        logfh = open(output_dir + "/error_report.txt", mode="w")
-        logfh.write(log_report)
-        logfh.close()
     except IOError as e:
         print("I/O error({0}): {1}".format(e.errno, e.strerror))
         print("Unable to write file")
 
+    # END mapstream
     nowtime = time.time()
     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
-    #profiler.disable()
-    #stats = pstats.Stats(profiler).sort_stats('ncalls')
-    #stats.print_stats()
 
 def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+    """
+    build all target records for a given input field
+    """
     build_records = False
     tgtrecords = []
     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
carrottransform/tools/file_helpers.py (new file)
@@ -0,0 +1,15 @@
+import os
+import sys
+import json
+
+# Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
+
+def load_json(f_in):
+    try:
+        data = json.load(open(f_in))
+    except Exception as err:
+        print ("{0} not found. Or cannot parse as json".format(f_in))
+        sys.exit()
+
+    return data
+
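This replaces the old helper (deleted at the bottom of this diff) that fell back to parsing its argument as a json string; the new version simply exits on error. A hedged usage sketch (rules.json is an invented path; MappingRules reaches this via tools.load_json):

    from carrottransform.tools.file_helpers import load_json

    # Prints a message and calls sys.exit() if the file is missing or not valid json
    rules = load_json("rules.json")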
carrottransform/tools/mappingrules.py
@@ -4,6 +4,10 @@ import carrottransform.tools as tools
 from .omopcdm import OmopCDM
 
 class MappingRules:
+    """
+    self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
+    as a file-specific dictionary allowing rules to be "looked-up" depending on data content
+    """
 
     def __init__(self, rulesfilepath, omopcdm):
         self.rules_data = tools.load_json(rulesfilepath)
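The accessors called from run.py show how the reorganised rules are consumed; a sketch assuming a valid rules file and an already-built OmopCDM instance:

    mappingrules = MappingRules(rules_file, omopcdm)
    dataset = mappingrules.get_dataset_name()
    infiles = mappingrules.get_all_infile_names()    # expected input csv names
    outfiles = mappingrules.get_all_outfile_names()  # OMOP target table names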
carrottransform/tools/metrics.py
@@ -1,12 +1,21 @@
 class Metrics():
+    """
+    Capture metrics for output to a summary tsv file, record counts at multiple levels
+    The main principle is to increment counts associated with datakeys (dkey) at different levels
+    """
     def __init__(self, dataset_name, log_threshold=0):
+        """
+        self.datasummary holds all the saved counts
+        """
         self.datasummary={}
         self.allcounts={}
-        self.log_data=""
         self.dataset_name=dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):
+        """
+        return a new, initialised, count structure
+        """
         counts = {}
         counts["input_count"] = 0
         counts["invalid_persids"] = 0
@@ -118,10 +127,3 @@ class Metrics():
             summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
 
         return summary_str
-
-    def add_log_data(self, msg):
-        self.log_data += msg + "\n"
-
-    def get_log_data(self):
-        return self.log_data
-
carrottransform/tools/omopcdm.py
@@ -4,6 +4,11 @@ import re
 import sys
 
 class OmopCDM:
+    """
+    Load and parse OMOP DDL data, to make an in-memory json CDM
+    Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
+    Define a series of "get" functions to allow CDM component discovery
+    """
 
     def __init__(self, omopddl, omopcfg):
         self.numeric_types = ["integer", "numeric"]
@@ -24,8 +29,8 @@ class OmopCDM:
     def load_ddl(self, omopddl):
         try:
             fp = open(omopddl, "r")
-        except IOError as e:
-            print("I/O error for ddl file ({0}): {1}".format(e.errno, e.strerror))
+        except Exception as err:
+            print("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
 
         return(self.process_ddl(fp))
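Construction mirrors the call in run.py (omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)); a sketch using the bundled 5.3 ddl, where the paths assume the package config directory and "person" is just an example table:

    omopcdm = OmopCDM("config/OMOPCDM_postgresql_5.3_ddl.sql", "config/omop.json")
    # get_omop_datetime_linked_fields is the accessor used by get_target_records
    date_fields = omopcdm.get_omop_datetime_linked_fields("person")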
pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "carrot-transform"
-version = "0.3"
+version = "0.3.2"
 authors = [
   { name="PD Appleby", email="pdappleby@gmail.com" },
 ]
-description = "Carrot somple transformer, input rules and data csvs, output OMOP"
+description = "Carrot simple transformer, input rules and data csv's, output OMOP"
 readme = "README.md"
 requires-python = ">=3.9"
 classifiers = [
carrot_transform-0.3/carrottransform/tools/file_helpers.py (deleted)
@@ -1,14 +0,0 @@
-import os
-import json
-
-def load_json(f_in):
-    if os.path.exists(f_in):
-        data = json.load(open(f_in))
-    else:
-        try:
-            data = json.loads(f_in)
-        except Exception as err:
-            raise FileNotFoundError(f"{f_in} not found. Or cannot parse as json")
-
-    return data
-