carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



carrottransform/tools/metrics.py

@@ -1,3 +1,95 @@
+
+import logging
+logger = logging.getLogger(__name__)
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+@dataclass
+class DataKey:
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+
+    def __str__(self) -> str:
+        """
+        The original implementation used strings as keys, then split by `~`.
+        This is kept in case that representation is still needed somewhere.
+        """
+        return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
+    def __hash__(self) -> int:
+        """
+        The DataKey is used as a key for a dictionary of key counts.
+        """
+        return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
+
+@dataclass
+class CountData:
+    counts: Dict[str, int] = field(default_factory=dict)
+
+    def increment(self, count_type: str):
+        if count_type not in self.counts:
+            self.counts[count_type] = 0
+        self.counts[count_type] += 1
+
+    def get_count(self, count_type: str, default: int = 0):
+        return self.counts.get(count_type, default)
+
+@dataclass
+class MapstreamSummaryRow:
+    """Represents a single row in the mapstream summary"""
+    dataset_name: str
+    source: str
+    fieldname: str
+    tablename: str
+    concept_id: str
+    additional: str
+    input_count: int = 0
+    invalid_person_ids: int = 0
+    invalid_date_fields: int = 0
+    invalid_source_fields: int = 0
+    output_count: int = 0
+
+    def to_tsv_row(self) -> str:
+        """Convert the row to a tab-separated string"""
+        row_list = [str(col) for col in [
+            self.dataset_name,
+            self.source,
+            self.fieldname,
+            self.tablename,
+            self.concept_id,
+            self.additional,
+            self.input_count,
+            self.invalid_person_ids,
+            self.invalid_date_fields,
+            self.invalid_source_fields,
+            self.output_count
+        ]]
+        # Before Python 3.12, f-string expressions cannot contain backslashes,
+        # so the '\t'.join stays outside the f-string.
+        row_str = '\t'.join(row_list)
+        return f"{row_str}\n"
+
+    @classmethod
+    def get_header(cls) -> str:
+        """Return the TSV header row"""
+        header = [
+            "dsname",
+            "source",
+            "source_field",
+            "target",
+            "concept_id",
+            "additional",
+            "incount",
+            "invalid_persid",
+            "invalid_date",
+            "invalid_source",
+            "outcount"
+        ]
+        header_str = '\t'.join(header)
+        return f"{header_str}\n"
+
 class Metrics():
     """
     Capture metrics for output to a summary tsv file, record counts at multiple levels
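Taken together, the new dataclasses replace the old `~`-joined string keys. A minimal sketch of how they compose - the import path is assumed from the wheel's RECORD, and the explicit `__hash__` plus the dataclass-generated `__eq__` are what let `DataKey` work as a dict key:

```python
# Sketch, not from the diff; module path assumed from the wheel's RECORD.
from carrottransform.tools.metrics import DataKey, CountData, MapstreamSummaryRow

counts = {}  # what Metrics.datasummary now holds: Dict[DataKey, CountData]

key = DataKey("demo.csv", "all", "person", "all", "")
counts.setdefault(key, CountData()).increment("output_count")
counts.setdefault(key, CountData()).increment("output_count")

# An equal key hashes to the same entry, so both increments landed together.
assert counts[DataKey("demo.csv", "all", "person", "all", "")].get_count("output_count") == 2
assert counts[key].get_count("input_count") == 0  # default for absent count types

row = MapstreamSummaryRow(
    dataset_name="demo", source="demo.csv", fieldname="all",
    tablename="person", concept_id="all", additional="",
    output_count=counts[key].get_count("output_count"),
)
print(MapstreamSummaryRow.get_header() + row.to_tsv_row(), end="")
```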
@@ -58,21 +150,87 @@ class Metrics():
                 self.datasummary[dkey][counttype] = 0
             self.datasummary[dkey][counttype] += int(count_block[counttype])
 
-    def increment_key_count(self, dkey, count_type):
-        """
-        Intended to work with the mapstream functions
-        """
+    def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
+        dkey = DataKey(source, fieldname, tablename, concept_id, additional)
+
         if dkey not in self.datasummary:
-            self.datasummary[dkey] = {}
-        if count_type not in self.datasummary[dkey]:
-            self.datasummary[dkey][count_type] = 0
-        self.datasummary[dkey][count_type] += 1
+            self.datasummary[dkey] = CountData()
+
+        self.datasummary[dkey].increment(count_type)
+
+    def increment_with_datacol(
+        self,
+        source_path: str,
+        target_file: str,
+        datacol: str,
+        out_record: List[str],
+    ) -> None:
+        # Are the parameters for DataKeys hierarchical?
+        # If so, a nested structure where a Source contains n Fields etc., and each has a method to sum its children, would be better.
+        # But I don't know if that's the desired behaviour.
+
+        # A lot of these calls increment the same thing, so `increment_this` is defined here.
+        def increment_this(
+            fieldname: str,
+            concept_id: str,
+            additional="",
+        ) -> None:
+            self.increment_key_count(
+                source=source_path,
+                fieldname=fieldname,
+                tablename=target_file,
+                concept_id=concept_id,
+                additional=additional,
+                count_type="output_count"
+            )
+        self.increment_key_count(
+            source=source_path,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+
+        self.increment_key_count(
+            source="all",
+            fieldname="all",
+            tablename=target_file,
+            concept_id="all",
+            additional="",
+            count_type="output_count"
+        )
+        increment_this(fieldname="all", concept_id="all")
+
+        if target_file == "person":
+            increment_this(fieldname="all", concept_id=out_record[1])
+            increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
+        else:
+            increment_this(fieldname=datacol, concept_id=out_record[2])
+            increment_this(fieldname="all", concept_id=out_record[2])
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename=target_file,
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+            self.increment_key_count(
+                source="all",
+                fieldname="all",
+                tablename="all",
+                concept_id=out_record[2],
+                additional="",
+                count_type="output_count"
+            )
+
 
     def get_summary(self):
         summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
 
         for dkey in self.datasummary:
-            #print(dkey)
+            logger.debug(dkey)
             source, tablename, name, colname = dkey.split('.')
             before_count = int(self.datasummary[dkey]["before"])
             after_count = int(self.datasummary[dkey]["after"])
@@ -90,40 +248,54 @@ class Metrics():
     def get_data_summary(self):
         return self.datasummary
 
-    def get_mapstream_summary(self):
-        summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
-
-        for dkey in sorted(self.datasummary):
-            try:
-                source, fieldname, tablename, concept_id, additional = dkey.split('~')
-            except ValueError:
-                print("get_mapstream_summary - ValueError: {0}".format(dkey))
-                break
-
-            source = self.get_prefix(source)
-            dvalue = self.datasummary[dkey]
-
-            input_count = "0"
-            if "input_count" in dvalue:
-                input_count = str(dvalue["input_count"])
+    def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
+        """
+        Creates a list of MapstreamSummaryRow from the datasummary
+        """
+        rows = []
 
-            invalid_person_ids = "0"
-            if "invalid_person_ids" in dvalue:
-                invalid_person_ids = str(dvalue["invalid_person_ids"])
+        for d_key in sorted(self.datasummary.keys(), key=str):
+            source = self.get_prefix(d_key.source)
+            count_data = self.datasummary[d_key]
 
-            invalid_source_fields = "0"
-            if "invalid_source_fields" in dvalue:
-                invalid_source_fields = str(dvalue["invalid_source_fields"])
+            row = MapstreamSummaryRow(
+                dataset_name=self.dataset_name,
+                source=source,
+                fieldname=d_key.fieldname,
+                tablename=d_key.tablename,
+                concept_id=d_key.concept_id,
+                additional=d_key.additional,
+                input_count=count_data.get_count("input_count"),
+                invalid_person_ids=count_data.get_count("invalid_person_ids"),
+                invalid_date_fields=count_data.get_count("invalid_date_fields"),
+                invalid_source_fields=count_data.get_count("invalid_source_fields"),
+                output_count=count_data.get_count("output_count")
+            )
 
-            invalid_date_fields = "0"
-            if "invalid_date_fields" in dvalue:
-                invalid_date_fields = str(dvalue["invalid_date_fields"])
+            if row.output_count >= self.log_threshold:
+                rows.append(row)
+        return rows
 
-            output_count = "0"
-            if "output_count" in dvalue:
-                output_count = str(dvalue["output_count"])
 
-            if (int(output_count) >= self.log_threshold):
-                summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
+    def get_mapstream_summary(self) -> str:
+        """
+        Makes a TSV string of the mapstream summary
+        """
+        summary_rows = self.get_mapstream_summary_rows()
+        result = MapstreamSummaryRow.get_header()
 
-        return summary_str
+        for row in summary_rows:
+            result += row.to_tsv_row()
+
+        return result
 
+    def get_mapstream_summary_dict(self) -> Dict:
+        """
+        Makes a dict of the mapstream summary
+        """
+        rows = self.get_mapstream_summary_rows()
+        return {
+            "dataset": self.dataset_name,
+            "threshold": self.log_threshold,
+            "rows": [vars(row) for row in rows]
+        }
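The summary is now available as typed rows, a TSV string, or a dict. A hedged usage sketch - the `Metrics` constructor arguments are an assumption inferred from the `self.dataset_name` and `self.log_threshold` references, since the constructor itself is not part of this diff:

```python
from carrottransform.tools.metrics import Metrics

# Assumed signature: Metrics(dataset_name, log_threshold); not shown in the diff.
metrics = Metrics("demo-dataset", 0)

# increment_key_count now takes the five key parts and builds the DataKey itself.
metrics.increment_key_count(
    source="demo.csv", fieldname="birth_date", tablename="person",
    concept_id="all", additional="", count_type="output_count",
)

print(metrics.get_mapstream_summary())         # TSV, header row included
summary = metrics.get_mapstream_summary_dict()  # {"dataset": ..., "threshold": ..., "rows": [...]}
```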
carrottransform/tools/omopcdm.py

@@ -1,8 +1,13 @@
 import carrottransform.tools as tools
 import json
+import logging
 import re
 import sys
 
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
 class OmopCDM:
     """
     Load and parse OMOP DDL data, to make an in-memory json CDM
@@ -14,7 +19,10 @@ class OmopCDM:
         self.numeric_types = ["integer", "numeric"]
         self.datetime_types = ["timestamp"]
         self.date_types = ["date"]
+        ## The ddl sets the headers that go in each table and whether each column is nullable; it also allows for more tables than we will use.
+        ## It also adds extra useful keys, such as 'all_columns' - before the merge.
         self.omop_json = self.load_ddl(omopddl)
+        ## Adds fields as a dict of dicts - presumably so they can be picked up by the get_columns() calls below.
        self.omop_json = self.merge_json(self.omop_json, omopcfg)
         self.all_columns = self.get_columns("all_columns")
         self.numeric_fields = self.get_columns("numeric_fields")
@@ -26,11 +34,11 @@ class OmopCDM:
         self.auto_number_field = self.get_columns("auto_number_field")
 
 
-    def load_ddl(self, omopddl):
+    def load_ddl(self, omopddl: Path):
         try:
-            fp = open(omopddl, "r")
+            fp = omopddl.open("r")
         except Exception as err:
-            print("OMOP ddl file ({0}) not found".format(omopddl))
+            logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
             sys.exit()
 
         return(self.process_ddl(fp))
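The switch from `open(omopddl)` to `omopddl.open()` pushes `Path` objects through the API, and `logger.exception` captures the traceback because it runs inside the `except` block. A standalone sketch of the same pattern - the `load_text` helper is hypothetical, not from the package:

```python
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

def load_text(path: Path) -> str:  # hypothetical helper, same shape as load_ddl
    try:
        with path.open("r") as fp:
            return fp.read()
    except OSError:
        # logger.exception logs at ERROR level and appends the active traceback
        logger.exception("file (%s) could not be opened", path)
        sys.exit(1)
```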
@@ -47,9 +55,13 @@ class OmopCDM:
         output_dict["datetime_fields"] = {}
         output_dict["date_fields"] = {}
 
+        ## Version number: matches '--postgresql', any run of characters, then digits of the form X.Y at the end of the line
         ver_rgx = re.compile(r'^--postgresql.*(\d+\.\d+)$')
-        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([A-Z_]+)')
+        ## Table name: matches 'CREATE TABLE', an optional '@schema.' prefix, then the table name (letters and underscores, now upper or lower case)
+        start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
+        ## Column definition: leading whitespace, a lower-case field name (letters/underscores), whitespace, then the data type
         datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')
+        ## End of a table definition: a closing bracket followed by a semicolon at the end of the line
         end_rgx = re.compile(r'.*[)];$')
         vermatched = False
         processing_table_data = False
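The widened `start_rgx` group (`[A-Z_]+` to `[a-zA-Z_]+`) is the behavioural change in this hunk: lower-case table names now match. A quick check against representative DDL lines (the sample strings are illustrative, not taken from the shipped SQL file):

```python
import re

start_rgx = re.compile(r'^CREATE\s*TABLE\s*(\@?[a-zA-Z]+\.)?([a-zA-Z_]+)')
datatype_rgx = re.compile(r'^\s*([a-z_]+)\s+([a-zA-Z_]+)')

m = start_rgx.match("CREATE TABLE @cdmDatabaseSchema.person (")
print(m.group(2))  # 'person' - the old [A-Z_]+ group would have rejected this

m = datatype_rgx.match("    person_id integer NOT NULL,")
print(m.group(1), m.group(2))  # 'person_id integer'
```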
@@ -76,7 +88,7 @@ class OmopCDM:
                 fname = idtmatch.group(1)
                 ftype = idtmatch.group(2)
 
-                # Check for dictionary element presence
+                # Check for dictionary element presence, and start an empty list if it doesn't already exist
                 if tabname not in output_dict["all_columns"]:
                     output_dict["all_columns"][tabname] = []
                 if tabname not in output_dict["numeric_fields"]:
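Aside: the presence-check-then-append pattern the new comment describes is what `collections.defaultdict` automates. A sketch of the equivalent, offered only as context, not as a suggested change to the diff:

```python
from collections import defaultdict

output_dict = {"all_columns": defaultdict(list)}
output_dict["all_columns"]["person"].append("person_id")  # no presence check needed
```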
carrot_transform-0.3.3.dist-info/METADATA

@@ -1,48 +0,0 @@
-Metadata-Version: 2.3
-Name: carrot_transform
-Version: 0.3.3
-Summary:
-Author: anwarfg
-Author-email: 913028+anwarfg@users.noreply.github.com
-Requires-Python: >=3.10,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: click (>=8.1.7,<9.0.0)
-Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
-Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Description-Content-Type: text/markdown
-
-<p align="center">
-  <a href="https://carrot.ac.uk/" target="_blank">
-    <picture>
-      <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
-      <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
-    </picture>
-  </a>
-</p>
-<div align="center">
-  <strong>
-  <h2>Streamlined Data Mapping to OMOP</h2>
-  <a href="https://carrot.ac.uk/">Carrot Tranform</a> executes the conversion of the data to the OMOP CDM.<br />
-  </strong>
-</div>
-
-TODO:
-
-- Document carrot-transform
-- Add more comments in-code
-- Handle capture of ddl and json config via the command-line as optional args
-
-Reduction in complexity over the original CaRROT-CDM version for the Transform part of _ETL_ - In practice _Extract_ is always
-performed by Data Partners, _Load_ by database bulk-load software.
-
-Statistics
-
-External libraries imported (approximate)
-
-carrot-cdm        61
-carrot-transform  12
-
carrot_transform-0.3.3.dist-info/RECORD

@@ -1,17 +0,0 @@
-carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
-carrottransform/_version.py,sha256=NfGqG2TgfjxxrlCHaOtwl3BcE0f6UH0VPrQgoDPjV7Y,72
-carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
-carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-carrottransform/cli/subcommands/run.py,sha256=3z5cRG4ekyPOP5tvjZOyHUxbclKfBr_Z0tQRRoKj73E,20651
-carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
-carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
-carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
-carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
-carrottransform/tools/mappingrules.py,sha256=bV6tXHBwVeKAUgCwFTZE2-qTcxKtbs3zbJWedBSviVI,6567
-carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
-carrottransform/tools/omopcdm.py,sha256=ycyPGgUTUwui7MLxH8JXd-MyCRkG0xOfEoDhCXeogmQ,7623
-carrot_transform-0.3.3.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
-carrot_transform-0.3.3.dist-info/METADATA,sha256=23mVHLHLXOqgXUFLoU7cSaqIr_yzl9mYf_zgZnteeoY,1474
-carrot_transform-0.3.3.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-carrot_transform-0.3.3.dist-info/RECORD,,