carrot-transform 0.3.5-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of carrot-transform has been flagged as potentially problematic.

Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
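Reviewer's note: wheels are ordinary zip archives, so a diff like the one rendered below can be reproduced locally without installing either release. The sketch below is a minimal, illustrative approach using pip and the standard library; the member file and paths are assumptions, not part of this page.

```python
# Sketch: download both wheels and diff one member file they share.
import difflib
import subprocess
import zipfile

for version in ("0.3.5", "0.4.0"):
    subprocess.run(
        ["pip", "download", f"carrot-transform=={version}", "--no-deps", "-d", "wheels"],
        check=True,
    )

def read_member(wheel: str, member: str) -> list[str]:
    # A wheel is a zip file; members can be read without installing it.
    with zipfile.ZipFile(wheel) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)

old = read_member("wheels/carrot_transform-0.3.5-py3-none-any.whl",
                  "carrottransform/tools/metrics.py")
new = read_member("wheels/carrot_transform-0.4.0-py3-none-any.whl",
                  "carrottransform/tools/metrics.py")
print("".join(difflib.unified_diff(old, new, "0.3.5", "0.4.0")))
```

The two largest inline diffs on this page, carrottransform/tools/metrics.py and carrottransform/tools/omopcdm.py, follow.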
--- a/carrottransform/tools/metrics.py
+++ b/carrottransform/tools/metrics.py
@@ -1,17 +1,18 @@
-
-import logging
-logger = logging.getLogger(__name__)
-
 from dataclasses import dataclass, field
 from typing import Dict, List
 
+from carrottransform.tools.logger import logger_setup
+
+logger = logger_setup()
+
+
 @dataclass
 class DataKey:
     source: str
-    fieldname:str
-    tablename:str
-    concept_id:str
-    additional:str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
 
     def __str__(self) -> str:
         """
@@ -19,11 +20,21 @@ class DataKey:
         This is here in case that representation is needed somewhere
         """
         return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+
     def __hash__(self) -> int:
         """
         The DataKey is used as a key for a dictionary of key counts
         """
-        return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
+        return hash(
+            (
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+            )
+        )
+
 
 @dataclass
 class CountData:
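The `__hash__` change is purely a black-style rewrap; behaviour is unchanged. `DataKey` needs an explicit `__hash__` because `Metrics` uses it as a dictionary key, and the dataclass-generated `__eq__` must agree with it. A quick illustration with invented values:

```python
# Illustrative: two DataKeys built from equal values hit the same dict slot.
from carrottransform.tools.metrics import DataKey

counts = {}
k1 = DataKey("demo.csv", "sex", "person", "8507", "")
k2 = DataKey("demo.csv", "sex", "person", "8507", "")
counts[k1] = 1
assert k1 == k2      # dataclass-generated __eq__
assert k2 in counts  # works because __hash__ hashes the same field tuple
```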
@@ -33,13 +44,15 @@ class CountData:
         if count_type not in self.counts:
             self.counts[count_type] = 0
         self.counts[count_type] += 1
-
-    def get_count(self, count_type: str, default: int=0):
+
+    def get_count(self, count_type: str, default: int = 0):
         return self.counts.get(count_type, default)
 
+
 @dataclass
 class MapstreamSummaryRow:
     """Represents a single row in the mapstream summary"""
+
     dataset_name: str
     source: str
     fieldname: str
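`CountData.get_count` only gains PEP 8 spacing around its default (`default: int = 0`). The class behaves like a small named counter, for example (illustrative, assuming its `counts` dict defaults to empty):

```python
# Illustrative use of CountData's increment/get_count pair.
from carrottransform.tools.metrics import CountData

cd = CountData()
cd.increment("output_count")
cd.increment("output_count")
assert cd.get_count("output_count") == 2
assert cd.get_count("invalid_date_fields") == 0  # falls back to the default
```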
@@ -51,57 +64,62 @@ class MapstreamSummaryRow:
     invalid_date_fields: int = 0
     invalid_source_fields: int = 0
     output_count: int = 0
-
+
     def to_tsv_row(self) -> str:
         """Convert the row to a tab-separated string"""
-        row_list = [str(col) for col in [
-            self.dataset_name,
-            self.source,
-            self.fieldname,
-            self.tablename,
-            self.concept_id,
-            self.additional,
-            self.input_count,
-            self.invalid_person_ids,
-            self.invalid_date_fields,
-            self.invalid_source_fields,
-            self.output_count
-        ]]
+        row_list = [
+            str(col)
+            for col in [
+                self.dataset_name,
+                self.source,
+                self.fieldname,
+                self.tablename,
+                self.concept_id,
+                self.additional,
+                self.input_count,
+                self.invalid_person_ids,
+                self.invalid_date_fields,
+                self.invalid_source_fields,
+                self.output_count,
+            ]
+        ]
         # If python gets updated, you can move the row_str expression into the f-string
-        row_str = '\t'.join(row_list)
+        row_str = "\t".join(row_list)
         return f"{row_str}\n"
-
+
     @classmethod
     def get_header(cls) -> str:
         """Return the TSV header row"""
         header = [
-                "dsname",
-                "source",
-                "source_field",
-                "target",
-                "concept_id",
-                "additional",
-                "incount",
-                "invalid_persid",
-                "invalid_date",
-                "invalid_source",
-                "outcount"
-        ]
-        header_str = '\t'.join(header)
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount",
+        ]
+        header_str = "\t".join(header)
         return f"{header_str}\n"
 
-class Metrics():
+
+class Metrics:
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
     The main principle is to increment counts associated with datakeys (dkey) at different levels
     """
+
     def __init__(self, dataset_name, log_threshold=0):
         """
         self.datasummary holds all the saved counts
         """
-        self.datasummary={}
-        self.allcounts={}
-        self.dataset_name=dataset_name
+        self.datasummary = {}
+        self.allcounts = {}
+        self.dataset_name = dataset_name
         self.log_threshold = log_threshold
 
     def get_new_mapstream_counts(self):
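`to_tsv_row` and `get_header` are only rewrapped; the column order is untouched, so 0.3.5 and 0.4.0 emit identical TSV. For example (values invented, remaining fields left at their defaults):

```python
# Illustrative: header plus one data row of the mapstream summary TSV.
from carrottransform.tools.metrics import MapstreamSummaryRow

row = MapstreamSummaryRow(
    dataset_name="demo",
    source="demo.csv",
    fieldname="sex",
    tablename="person",
    concept_id="8507",
    additional="",
    output_count=9,
)
tsv = MapstreamSummaryRow.get_header() + row.to_tsv_row()
# dsname  source    source_field  target  concept_id  ...  outcount
# demo    demo.csv  sex           person  8507        ...  9
```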
@@ -135,8 +153,18 @@ class Metrics():
         prfx = "NA"
         if "source_files" in increment:
             if fieldname in increment["source_files"]:
-                prfx = self.get_prefix(increment["source_files"][fieldname]["table"])
-            dkey = prfx + "." + desttablename + "." + name + "." + fieldname
+                prfx = self.get_prefix(
+                    increment["source_files"][fieldname]["table"]
+                )
+            dkey = (
+                prfx
+                + "."
+                + desttablename
+                + "."
+                + name
+                + "."
+                + fieldname
+            )
             self.add_counts_to_summary(dkey, dataitem[fieldname])
 
     def get_prefix(self, fname):
@@ -150,7 +178,9 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])
 
-    def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
+    def increment_key_count(
+        self, source, fieldname, tablename, concept_id, additional, count_type
+    ):
         dkey = DataKey(source, fieldname, tablename, concept_id, additional)
 
         if dkey not in self.datasummary:
@@ -159,87 +189,111 @@ class Metrics():
         self.datasummary[dkey].increment(count_type)
 
     def increment_with_datacol(
-            self,
-            source_path: str,
-            target_file: str,
-            datacol: str,
-            out_record: List[str],
-            ) -> None:
-        #Are the parameters for DataKeys hierarchical?
-        #If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
-        #But I don't know if that's the desired behaviour
-
-        #A lot of these increment the same thing, so I have defined `increment_this`
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
+        # But I don't know if that's the desired behaviour
+
+        # A lot of these increment the same thing, so I have defined `increment_this`
         def increment_this(
-                fieldname: str,
-                concept_id: str,
-                additional = "",
-                ) -> None:
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
             self.increment_key_count(
-                    source=source_path,
-                    fieldname=fieldname,
-                    tablename=target_file,
-                    concept_id=concept_id,
-                    additional=additional,
-                    count_type="output_count"
-            )
-            self.increment_key_count(
                 source=source_path,
-                fieldname="all",
-                tablename="all",
-                concept_id="all",
-                additional="",
-                count_type="output_count"
-            )
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count",
+            )
 
             self.increment_key_count(
-                source="all",
-                fieldname="all",
-                tablename=target_file,
-                concept_id="all",
-                additional="",
-                count_type="output_count"
-            )
+                source=source_path,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="output_count",
+            )
+
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id="all",
+                additional="",
+                count_type="output_count",
+            )
         increment_this(fieldname="all", concept_id="all")
-
+
         if target_file == "person":
             increment_this(fieldname="all", concept_id=out_record[1])
-            increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
+            increment_this(
+                fieldname="all", concept_id=out_record[1], additional=out_record[2]
+            )
         else:
             increment_this(fieldname=datacol, concept_id=out_record[2])
             increment_this(fieldname="all", concept_id=out_record[2])
             self.increment_key_count(
-                    source="all",
-                    fieldname="all",
-                    tablename=target_file,
-                    concept_id=out_record[2],
-                    additional="",
-                    count_type="output_count"
-            )
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
             self.increment_key_count(
-                    source="all",
-                    fieldname="all",
-                    tablename="all",
-                    concept_id=out_record[2],
-                    additional="",
-                    count_type="output_count"
-            )
-
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count",
+            )
 
     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
 
         for dkey in self.datasummary:
             logger.debug(dkey)
-            source, tablename, name, colname = dkey.split('.')
+            source, tablename, name, colname = dkey.split(".")
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
             after_pct = (float)(before_count - after_count) * 100 / before_count
-            summary_str += source + "\t" + tablename + "\t" + name + "\t" + colname + "\t" + str(before_count) + "\t" + str(after_count) + "\t" + "{0:.3f}".format(after_pct) + "\t"
+            summary_str += (
+                source
+                + "\t"
+                + tablename
+                + "\t"
+                + name
+                + "\t"
+                + colname
+                + "\t"
+                + str(before_count)
+                + "\t"
+                + str(after_count)
+                + "\t"
+                + "{0:.3f}".format(after_pct)
+                + "\t"
+            )
             if "after_formatting" in self.datasummary[dkey]:
                 after_format_count = int(self.datasummary[dkey]["after_formatting"])
-                after_format_pct = (float)(after_count - after_format_count) * 100 / after_count
-                summary_str += str(after_format_count) + "\t" + "{0:.3f}".format(after_format_pct) + "\n"
+                after_format_pct = (
+                    (float)(after_count - after_format_count) * 100 / after_count
+                )
+                summary_str += (
+                    str(after_format_count)
+                    + "\t"
+                    + "{0:.3f}".format(after_format_pct)
+                    + "\n"
+                )
             else:
                 summary_str += "NA\tNA\n"
 
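The behaviour of `increment_with_datacol` is unchanged by the rewrap: every output record still fans out through `increment_this` to the specific key plus "all" roll-ups at the source, table, and global levels, so the summary can be read at any granularity. Schematically (illustrative data only):

```python
# Illustrative: one person record bumps the specific key and its roll-ups.
from carrottransform.tools.metrics import Metrics

m = Metrics(dataset_name="demo")
m.increment_with_datacol(
    source_path="demo.csv",
    target_file="person",
    datacol="sex",
    out_record=["1", "8507", "1970-01-01"],  # id, concept, additional value
)
# m.datasummary now maps several DataKeys, e.g.
# DataKey("demo.csv", "all", "person", "8507", "") and the "all" roll-ups,
# each to a CountData whose counts["output_count"] has been incremented.
```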
@@ -269,24 +323,23 @@ class Metrics():
                 invalid_person_ids=count_data.get_count("invalid_person_ids"),
                 invalid_date_fields=count_data.get_count("invalid_date_fields"),
                 invalid_source_fields=count_data.get_count("invalid_source_fields"),
-                output_count=count_data.get_count("output_count")
+                output_count=count_data.get_count("output_count"),
             )
 
             if row.output_count >= self.log_threshold:
                 rows.append(row)
         return rows
 
-
     def get_mapstream_summary(self) -> str:
         """
         Makes a TSV string of the mapstream summary
         """
         summary_rows = self.get_mapstream_summary_rows()
         result = MapstreamSummaryRow.get_header()
-
+
         for row in summary_rows:
             result += row.to_tsv_row()
-
+
         return result
 
     def get_mapstream_summary_dict(self) -> Dict:
@@ -297,5 +350,5 @@ class Metrics():
         return {
             "dataset": self.dataset_name,
             "threshold": self.log_threshold,
-            "rows": [vars(row) for row in rows]
+            "rows": [vars(row) for row in rows],
        }
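`get_mapstream_summary_dict` is the JSON-friendly twin of the TSV output; the only change here is the trailing comma. Since rows are serialized with `vars(row)`, each entry carries exactly the `MapstreamSummaryRow` fields, e.g. (values illustrative):

```python
# Illustrative shape of the dict returned by get_mapstream_summary_dict().
{
    "dataset": "demo",
    "threshold": 0,
    "rows": [
        {
            "dataset_name": "demo",
            "source": "demo.csv",
            "fieldname": "all",
            "tablename": "person",
            "concept_id": "8507",
            "additional": "",
            "input_count": 0,
            "invalid_person_ids": 0,
            "invalid_date_fields": 0,
            "invalid_source_fields": 0,
            "output_count": 1,
        },
    ],
}
```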
--- a/carrottransform/tools/omopcdm.py
+++ b/carrottransform/tools/omopcdm.py
@@ -1,18 +1,19 @@
 import carrottransform.tools as tools
 import json
-import logging
+from carrottransform.tools.logger import logger_setup
 import re
 import sys
 
 from pathlib import Path
 
-logger = logging.getLogger(__name__)
+logger = logger_setup()
+
 
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
     Merge in manual additions (currently necessary to identify, person id, date / time fields and autonumber fields)
-    Define a series of "get" functions to allow CDM component discovery 
+    Define a series of "get" functions to allow CDM component discovery
     """
 
     def __init__(self, omopddl, omopcfg):
@@ -33,15 +34,14 @@ class OmopCDM:
         self.person_id_field = self.get_columns("person_id_field")
         self.auto_number_field = self.get_columns("auto_number_field")
 
-
     def load_ddl(self, omopddl: Path):
         try:
-            fp = omopddl.open("r") 
-        except Exception as err:
+            fp = omopddl.open("r")
+        except Exception:
             logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
-
-        return(self.process_ddl(fp))
+
+        return self.process_ddl(fp)
 
     def process_ddl(self, fp):
         """
@@ -56,13 +56,13 @@ class OmopCDM:
         output_dict["date_fields"] = {}
 
         ## matching for version number - matches '--postgres', any number of chars and some digits of the form X.Y, plus an end of string or end of line
-        ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
+        ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
         ## matching for table name - matches 'CREATE TABLE @', some letters (upper and lower case), '.' and some more letters (lower case)
-        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
+        start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
        ## matches some whitespace, lower case letters(or underscores), whitespace, letters (upper/lower and underscores)
-        datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
         ## matching for end of file - matches close bracket, semi colon, end of file or line
-        end_rgx = re.compile(r'.*[)];$')
+        end_rgx = re.compile(r".*[)];$")
         vermatched = False
         processing_table_data = False
         tabname = ""
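The four regex changes above only swap single quotes for double quotes around raw strings; the patterns are identical. To see what they match against typical OMOP DDL input (sample lines assumed, but representative of the bundled OMOPCDM_postgresql_5.4_ddl.sql):

```python
# Illustrative matches for the four DDL-parsing patterns.
import re

ver_rgx = re.compile(r"^--postgresql.*(\d+\.\d+)$")
start_rgx = re.compile(r"^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)")
datatype_rgx = re.compile(r"^\s*([a-z_]+)\s+([a-zA-Z_]+)")
end_rgx = re.compile(r".*[)];$")

assert ver_rgx.search("--postgresql OMOP CDM ddl 5.4")             # version "5.4"
m = start_rgx.search("CREATE TABLE @cdmDatabaseSchema.person (")
assert m and m.group(2).lower() == "person"                        # table name
m = datatype_rgx.search("person_id integer NOT NULL,")
assert m and m.groups() == ("person_id", "integer")                # field + type
assert end_rgx.search(");")                                        # end of table
```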
@@ -70,21 +70,22 @@ class OmopCDM:
         for line in fp:
             line = line.strip()
             # check for line with version, if present
-            if vermatched == False:
+            if not vermatched:
                 vmatch = ver_rgx.search(line)
-                if vmatch != None:
+                if vmatch is not None:
                     version_string = vmatch.group(1)
                     output_dict["omop_version"] = version_string
                     vermatched = True
+
             # check for start of table definition
-            if processing_table_data == False:
+            if not processing_table_data:
                 smatch = start_rgx.search(line)
-                if smatch != None:
+                if smatch is not None:
                     processing_table_data = True
                     tabname = smatch.group(2).lower()
             else:
                 idtmatch = datatype_rgx.search(line)
-                if idtmatch != None:
+                if idtmatch is not None:
                     fname = idtmatch.group(1)
                     ftype = idtmatch.group(2)
 
@@ -99,12 +100,16 @@ class OmopCDM:
                         output_dict["datetime_fields"][tabname] = []
                     if tabname not in output_dict["date_fields"]:
                         output_dict["date_fields"][tabname] = []
-
+
                     # Add in required column / field data
                     output_dict["all_columns"][tabname].append(fname)
                     if ftype.lower() in self.numeric_types:
                         output_dict["numeric_fields"][tabname].append(fname)
-                    if ftype.lower() in self.numeric_types and "NOT" in line and "NULL" in line:
+                    if (
+                        ftype.lower() in self.numeric_types
+                        and "NOT" in line
+                        and "NULL" in line
+                    ):
                         output_dict["notnull_numeric_fields"][tabname].append(fname)
                     if ftype.lower() in self.datetime_types:
                         output_dict["datetime_fields"][tabname].append(fname)
@@ -112,19 +117,19 @@ class OmopCDM:
                         output_dict["date_fields"][tabname].append(fname)
 
             ematch = end_rgx.search(line)
-            if ematch != None:
+            if ematch is not None:
                 processing_table_data = False
-
-        return(output_dict)
-
+
+        return output_dict
+
     def dump_ddl(self):
-        return(json.dumps(self.omop_json, indent=2))
+        return json.dumps(self.omop_json, indent=2)
 
     def merge_json(self, omopjson, omopcfg):
         tmp_json = tools.load_json(omopcfg)
         for key, data in tmp_json.items():
             omopjson[key] = data
-        return(omopjson)
+        return omopjson
 
     def get_columns(self, colkey):
         if colkey in self.omop_json:
@@ -157,43 +162,43 @@ class OmopCDM:
         return True
 
     def get_omop_numeric_fields(self, tablename):
-        if self.numeric_fields != None:
+        if self.numeric_fields is not None:
             if tablename in self.numeric_fields:
                 return self.numeric_fields[tablename]
         return []
 
     def get_omop_notnull_numeric_fields(self, tablename):
-        if self.notnull_numeric_fields != None:
+        if self.notnull_numeric_fields is not None:
             if tablename in self.notnull_numeric_fields:
                 return self.notnull_numeric_fields[tablename]
         return []
 
     def get_omop_datetime_linked_fields(self, tablename):
-        if self.datetime_linked_fields != None:
+        if self.datetime_linked_fields is not None:
             if tablename in self.datetime_linked_fields:
                 return self.datetime_linked_fields[tablename]
         return {}
 
     def get_omop_date_field_components(self, tablename):
-        if self.date_field_components != None:
+        if self.date_field_components is not None:
             if tablename in self.date_field_components:
                 return self.date_field_components[tablename]
         return {}
 
     def get_omop_datetime_fields(self, tablename):
-        if self.datetime_fields != None:
+        if self.datetime_fields is not None:
             if tablename in self.datetime_fields:
                 return self.datetime_fields[tablename]
         return []
 
     def get_omop_person_id_field(self, tablename):
-        if self.person_id_field != None:
+        if self.person_id_field is not None:
             if tablename in self.person_id_field:
                 return self.person_id_field[tablename]
         return None
 
     def get_omop_auto_number_field(self, tablename):
-        if self.auto_number_field != None:
+        if self.auto_number_field is not None:
             if tablename in self.auto_number_field:
                 return self.auto_number_field[tablename]
         return None
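All eight getters share one defensive pattern: tolerate an attribute that was never populated (the `is not None` fixes are the only change in this hunk), then fall back to an empty container or None. Each is close to a one-line dict lookup; a condensed equivalent for the first getter, as a sketch rather than the package's code:

```python
def get_omop_numeric_fields(self, tablename):
    # A None attribute or a missing table both fall back to the empty list.
    return (self.numeric_fields or {}).get(tablename, [])
```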