carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of carrot-transform might be problematic.

Files changed (33):
  1. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +302 -443
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/click.py +21 -0
  16. carrottransform/tools/concept_helpers.py +61 -0
  17. carrottransform/tools/core.py +163 -0
  18. carrottransform/tools/date_helpers.py +79 -0
  19. carrottransform/tools/file_helpers.py +177 -7
  20. carrottransform/tools/logger.py +19 -0
  21. carrottransform/tools/mapping_types.py +32 -0
  22. carrottransform/tools/mappingrules.py +298 -32
  23. carrottransform/tools/metrics.py +274 -49
  24. carrottransform/tools/omopcdm.py +42 -32
  25. carrottransform/tools/orchestrator.py +381 -0
  26. carrottransform/tools/person_helpers.py +126 -0
  27. carrottransform/tools/record_builder.py +413 -0
  28. carrottransform/tools/stream_helpers.py +71 -0
  29. carrottransform/tools/types.py +71 -0
  30. carrottransform/tools/validation.py +62 -0
  31. carrot_transform-0.3.4.dist-info/RECORD +0 -24
  32. carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
  33. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/metrics.py (+274 -49)

@@ -1,15 +1,125 @@
-class Metrics():
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+from carrottransform.tools.logger import logger_setup
+
+logger = logger_setup()
+
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is here in case that representation is needed somewhere
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts
+        """
+        return hash(
+            (
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+            )
+        )
+
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int = 0):
+        return self.counts.get(count_type, default)
+
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [
+            str(col)
+            for col in [
+                self.dataset_name,
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+                self.input_count,
+                self.invalid_person_ids,
+                self.invalid_date_fields,
+                self.invalid_source_fields,
+                self.output_count,
+            ]
+        ]
+        # If python gets updated, you can move the row_str expression into the f-string
+        row_str = "\t".join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount",
+        ]
+        header_str = "\t".join(header)
+        return f"{header_str}\n"
+
+
+class Metrics:
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
     The main principle is to increment counts associated with datakeys (dkey) at different levels
     """
+
     def __init__(self, dataset_name, log_threshold=0):
         """
         self.datasummary holds all the saved counts
         """
-        self.datasummary={}
-        self.allcounts={}
-        self.dataset_name=dataset_name
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
         self.log_threshold = log_threshold

     def get_new_mapstream_counts(self):
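
An aside, not part of the diff: the new DataKey and CountData classes above can be exercised on their own. A minimal sketch, assuming the 0.4.0 wheel is installed so carrottransform.tools.metrics is importable; the key values here are illustrative:

from carrottransform.tools.metrics import CountData, DataKey

# Positional arguments follow the dataclass field order above.
key = DataKey("demographics.csv", "sex", "person", "8507", "")

# str() reproduces the old "~"-joined key representation.
print(str(key))  # demographics.csv~sex~person~8507~

# The explicit __hash__ keeps DataKey usable as a dictionary key.
counts = {key: CountData()}
counts[key].increment("input_count")
print(counts[key].get_count("input_count"))  # 1
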
@@ -43,8 +153,18 @@ class Metrics():
             prfx = "NA"
             if "source_files" in increment:
                 if fieldname in increment["source_files"]:
-                    prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
-            dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                    prfx = self.get_prefix(
+                        increment["source_files"][fieldname]["table"]
+                    )
+            dkey = (
+                prfx
+                + "."
+                + desttablename
+                + "."
+                + name
+                + "."
+                + fieldname
+            )
             self.add_counts_to_summary(dkey, dataitem[fieldname])

     def get_prefix(self, fname):
@@ -58,30 +178,122 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])

-    def increment_key_count(self, dkey, count_type):
-        """
-        Intended to work with the mapstream functions
-        """
+    def increment_key_count(
+        self, source, fieldname, tablename, concept_id, additional, count_type
+    ):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] = {}
-        if count_type not in self.datasummary[dkey]:
-            self.datasummary[dkey][count_type] = 0
-        self.datasummary[dkey][count_type] += 1
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        # But I don't know if that's the desired behaviour
+
+        # A lot of these increment the same thing, so I have defined `increment_this`
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count",
+            )
+
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count",
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(
+                fieldname="all", concept_id=out_record[1], additional=out_record[2]
+            )
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )

     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"

         for dkey in self.datasummary:
-            #print(dkey)
-            source, tablename, name, colname = dkey.split('.')
+            logger.debug(dkey)
+            source, tablename, name, colname = dkey.split(".")
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
             after_pct = (float)(before_count - after_count) * 100 / before_count
-            summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
+            summary_str += (
+                source
+                + "\t"
+                + tablename
+                + "\t"
+                + name
+                + "\t"
+                + colname
+                + "\t"
+                + str(before_count)
+                + "\t"
+                + str(after_count)
+                + "\t"
+                + "{0:.3f}".format(after_pct)
+                + "\t"
+            )
             if "after_formatting" in self.datasummary[dkey]:
                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
-                after_format_pct = (float)(after_count - after_format_count) * 100 / after_count
-                summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
+                after_format_pct = (
+                    (float)(after_count - after_format_count) * 100 / after_count
+                )
+                summary_str += (
+                    str(after_format_count)
+                    + "\t"
+                    + "{0:.3f}".format(after_format_pct)
+                    + "\n"
+                )
             else:
                 summary_str += "NA\tNA\n"

@@ -90,40 +302,53 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary

-    def get_mapstream_summary(self):
-        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
-
-        for dkey in sorted(self.datasummary):
-            try:
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []

-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]

-            invalid_person_ids = "0"
-            if "invalid_person_ids" in dvalue:
-                invalid_person_ids = str(dvalue["invalid_person_ids"])
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count"),
+            )

-            invalid_source_fields = "0"
-            if "invalid_source_fields" in dvalue:
-                invalid_source_fields = str(dvalue["invalid_source_fields"])
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows

-            invalid_date_fields = "0"
-            if "invalid_date_fields" in dvalue:
-                invalid_date_fields = str(dvalue["invalid_date_fields"])
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()

-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
+        for row in summary_rows:
+            result += row.to_tsv_row()

-            if (int(output_count) >= self.log_threshold):
-                summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+        return result

-        return summary_str
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows],
+        }
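
Taken together, the metrics changes swap the old "~"-joined string keys for typed DataKey objects and structured summary rows. A minimal usage sketch, again assuming the installed 0.4.0 package; the dataset name and key parts are illustrative:

from carrottransform.tools.metrics import Metrics

metrics = Metrics(dataset_name="example_dataset", log_threshold=0)

# increment_key_count now takes the key parts directly rather than a
# pre-joined "~" string.
metrics.increment_key_count(
    source="demographics.csv",
    fieldname="sex",
    tablename="person",
    concept_id="8507",
    additional="",
    count_type="output_count",
)

# TSV summary: MapstreamSummaryRow.get_header() plus one row per DataKey
# whose output_count meets log_threshold.
print(metrics.get_mapstream_summary())

# New in 0.4.0: the same summary as a plain dict.
print(metrics.get_mapstream_summary_dict())
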
carrottransform/tools/omopcdm.py (+42 -32)

@@ -1,13 +1,19 @@
 import carrottransform.tools as tools
 import json
+from carrottransform.tools.logger import logger_setup
 import re
 import sys

+from pathlib import Path
+
+logger = logger_setup()
+
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
     Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
-    Define a series of "get" functions to allow CDM component discovery 
+    Define a series of "get" functions to allow CDM component discovery
     """

     def __init__(self, omopddl, omopcfg):
@@ -28,15 +34,14 @@ class OmopCDM:
         self.person_id_field = self.get_columns("person_id_field")
         self.auto_number_field = self.get_columns("auto_number_field")

-
-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(omopddl, "r")
-        except Exception as err:
-            print("OMOP ddl file ({0}) not found".format(omopddl))
+            fp = omopddl.open("r")
+        except Exception:
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
-
-        return(self.process_ddl(fp))
+
+        return self.process_ddl(fp)

     def process_ddl(self, fp):
         """
@@ -51,13 +56,13 @@ class OmopCDM:
         output_dict["date_fields"] = {}

         ## matching for version number - matches '--postgres', any number of chars and some digits of the form X.Y, plus an end of string or end of line
-        ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+        ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
         ## matching for table name - matches 'CREATE TABLE @', some letters (upper and lower case), '.' and some more letters (lower case)
-        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
+        start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
         ## matches some whitespace, lower case letters(or underscores), whitespace, letters (upper/lower and underscores)
-        datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
         ## matching for end of file - matches close bracket, semi colon, end of file or line
-        end_rgx = re.compile(r'.*[)];$')
+        end_rgx = re.compile(r".*[)];$")
         vermatched = False
         processing_table_data = False
         tabname = ""
@@ -65,21 +70,22 @@ class OmopCDM:
         for line in fp:
             line = line.strip()
             # check for line with version, if present
-            if vermatched == False:
+            if not vermatched:
                 vmatch = ver_rgx.search(line)
-                if vmatch != None:
+                if vmatch is not None:
                     version_string = vmatch.group(1)
                     output_dict["omop_version"] = version_string
                     vermatched = True
+
             # check for start of table definition
-            if processing_table_data == False:
+            if not processing_table_data:
                 smatch = start_rgx.search(line)
-                if smatch != None:
+                if smatch is not None:
                     processing_table_data = True
                     tabname = smatch.group(2).lower()
             else:
                 idtmatch = datatype_rgx.search(line)
-                if idtmatch != None:
+                if idtmatch is not None:
                     fname = idtmatch.group(1)
                     ftype = idtmatch.group(2)

@@ -94,12 +100,16 @@ class OmopCDM:
                         output_dict["datetime_fields"][tabname] = []
                     if tabname not in output_dict["date_fields"]:
                         output_dict["date_fields"][tabname] = []
-
+
                     # Add in required column / field data
                     output_dict["all_columns"][tabname].append(fname)
                     if ftype.lower() in self.numeric_types:
                         output_dict["numeric_fields"][tabname].append(fname)
-                    if ftype.lower() in self.numeric_types and "NOT" in line and "NULL" in line:
+                    if (
+                        ftype.lower() in self.numeric_types
+                        and "NOT" in line
+                        and "NULL" in line
+                    ):
                         output_dict["notnull_numeric_fields"][tabname].append(fname)
                     if ftype.lower() in self.datetime_types:
                         output_dict["datetime_fields"][tabname].append(fname)
@@ -107,19 +117,19 @@ class OmopCDM:
                         output_dict["date_fields"][tabname].append(fname)

             ematch = end_rgx.search(line)
-            if ematch != None:
+            if ematch is not None:
                 processing_table_data = False
-
-        return(output_dict)
-
+
+        return output_dict
+
     def dump_ddl(self):
-        return(json.dumps(self.omop_json, indent=2))
+        return json.dumps(self.omop_json, indent=2)

     def merge_json(self, omopjson, omopcfg):
         tmp_json = tools.load_json(omopcfg)
         for key, data in tmp_json.items():
             omopjson[key] = data
-        return(omopjson)
+        return omopjson

     def get_columns(self, colkey):
         if colkey in self.omop_json:
@@ -152,43 +162,43 @@ class OmopCDM:
             return True

     def get_omop_numeric_fields(self, tablename):
-        if self.numeric_fields != None:
+        if self.numeric_fields is not None:
             if tablename in self.numeric_fields:
                 return self.numeric_fields[tablename]
         return []

     def get_omop_notnull_numeric_fields(self, tablename):
-        if self.notnull_numeric_fields != None:
+        if self.notnull_numeric_fields is not None:
             if tablename in self.notnull_numeric_fields:
                 return self.notnull_numeric_fields[tablename]
         return []

     def get_omop_datetime_linked_fields(self, tablename):
-        if self.datetime_linked_fields != None:
+        if self.datetime_linked_fields is not None:
             if tablename in self.datetime_linked_fields:
                 return self.datetime_linked_fields[tablename]
         return {}

     def get_omop_date_field_components(self, tablename):
-        if self.date_field_components != None:
+        if self.date_field_components is not None:
             if tablename in self.date_field_components:
                 return self.date_field_components[tablename]
         return {}

     def get_omop_datetime_fields(self, tablename):
-        if self.datetime_fields != None:
+        if self.datetime_fields is not None:
             if tablename in self.datetime_fields:
                 return self.datetime_fields[tablename]
         return []

     def get_omop_person_id_field(self, tablename):
-        if self.person_id_field != None:
+        if self.person_id_field is not None:
             if tablename in self.person_id_field:
                 return self.person_id_field[tablename]
         return None

     def get_omop_auto_number_field(self, tablename):
-        if self.auto_number_field != None:
+        if self.auto_number_field is not None:
             if tablename in self.auto_number_field:
                 return self.auto_number_field[tablename]
         return None
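
The regular expressions in process_ddl() are unchanged apart from quote style, so their behaviour can be checked in isolation. A small standalone sketch; the three-line DDL snippet is illustrative, not taken from the bundled OMOPCDM_postgresql_5.4_ddl.sql:

import re

# The same patterns process_ddl() compiles, copied from the hunk above.
start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
end_rgx = re.compile(r".*[)];$")

ddl = [
    "CREATE TABLE @cdmDatabaseSchema.person (",
    "    person_id integer NOT NULL,",
    "    birth_datetime TIMESTAMP NULL );",
]

for line in ddl:
    line = line.strip()
    if (smatch := start_rgx.search(line)) is not None:
        print("table:", smatch.group(2).lower())  # table: person
    elif (dmatch := datatype_rgx.search(line)) is not None:
        print("column:", dmatch.group(1), dmatch.group(2))
    if end_rgx.search(line) is not None:
        print("end of table definition")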