carrot-transform 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of carrot-transform might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: carrot_transform
- Version: 0.3.4
+ Version: 0.3.5
  Summary:
  Author: anwarfg
  Author-email: 913028+anwarfg@users.noreply.github.com
@@ -12,8 +12,8 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: click (>=8.1.7,<9.0.0)
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+ Requires-Dist: numpy (<2)
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
- Requires-Dist: pytest (>=8.3.4,<9.0.0)
  Description-Content-Type: text/markdown

  <p align="center">
@@ -3,7 +3,7 @@ carrottransform/_version.py,sha256=bm7SM-_MN0gstlNsCDO6dAajKcjQD-NxI_xpvfRx0Ts,1
  carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
  carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- carrottransform/cli/subcommands/run.py,sha256=r2XanTvy4QowPbziZ5lqs-Tm8CAzCquL7DRy4lTT9Ak,23977
+ carrottransform/cli/subcommands/run.py,sha256=GfRHG_aLoBxuXkpGTTrRmsEcNUjTUB6cl8f1B7lTBt8,28461
  carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
  carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
  carrottransform/examples/test/inputs/Covid19_test.csv,sha256=d5t7Lfhkwbfe3Uk2IBqB2ZT5o0h9QaeraC8E5-IMERo,67521
@@ -13,12 +13,13 @@ carrottransform/examples/test/inputs/covid19_antibody.csv,sha256=SPCpyqpTbVq9987
  carrottransform/examples/test/inputs/vaccine.csv,sha256=_gcM-SIymyt2Dkkr_zGmQI9keIdmDm-gDI_QvXXLFrY,44037
  carrottransform/examples/test/rules/rules_14June2021.json,sha256=n2OYNFhbx-NLhmqjAad6RsfXjQFknZIgQ7a5uyJF0Co,13226
  carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
- carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
- carrottransform/tools/mappingrules.py,sha256=IiZx24G27Rag-YgV-4jDxprJea9Ce7SZUbjxMm0n49k,7040
- carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
- carrottransform/tools/omopcdm.py,sha256=MwS_MwwBrypwjbFLuxoE0xlddWIi0T3BEPgN9LPkGAs,8508
- carrot_transform-0.3.4.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
- carrot_transform-0.3.4.dist-info/METADATA,sha256=mbB8-GgOH6EnJXDr2j46Q97R3ID4Dro9IbgAFcJVAXY,4219
- carrot_transform-0.3.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
- carrot_transform-0.3.4.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
- carrot_transform-0.3.4.dist-info/RECORD,,
+ carrottransform/tools/click.py,sha256=5fxl9zL6piwWMN4cSule0tG90E9g7eFNosoSu1ES1og,471
+ carrottransform/tools/file_helpers.py,sha256=_NRswYjqpBBkp4efMBhFf9XIRaqYTw1-jA22usyrbqA,1204
+ carrottransform/tools/mappingrules.py,sha256=jvWTLCQoLoCegmLWHPyRSRVOTLejp7LzmFMr-ENmuTU,7121
+ carrottransform/tools/metrics.py,sha256=VrcePVGwgHCJqQ1i9Q_KqL6Cv8IbIce2pSRSBth9808,11011
+ carrottransform/tools/omopcdm.py,sha256=fcqIub5ud57i-5J3iUvPi2dqfGgyjWnWJTH1djQzq9E,8603
+ carrot_transform-0.3.5.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+ carrot_transform-0.3.5.dist-info/METADATA,sha256=cW5wfZRrZoai-nnV5k9FVYY8-XGm24Qadu0hYV4P9R8,4206
+ carrot_transform-0.3.5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+ carrot_transform-0.3.5.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
+ carrot_transform-0.3.5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.1
+ Generator: poetry-core 2.1.2
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,45 +1,65 @@
1
+ import carrottransform
2
+ import carrottransform.tools as tools
3
+ import click
1
4
  import csv
2
- import os, time
3
5
  import datetime
4
6
  import fnmatch
5
- import sys
6
- import click
7
- import json
8
7
  import importlib.resources
9
- import carrottransform
10
- import carrottransform.tools as tools
8
+ import json
9
+ import logging
10
+ import os
11
+ import sys
12
+ import time
13
+
14
+ from carrottransform.tools.click import PathArgs
11
15
  from carrottransform.tools.omopcdm import OmopCDM
12
- from typing import Iterator, IO
16
+ from pathlib import Path
17
+
18
+ from typing import Iterator, IO, Iterable
19
+ from ...tools.file_helpers import resolve_paths
20
+
21
+ logger = logging.getLogger(__name__)
22
+ if not logger.handlers:
23
+ logger.setLevel(logging.INFO)
24
+
25
+ console_handler = logging.StreamHandler()
26
+ console_handler.setLevel(logging.INFO)
27
+
28
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
29
+ console_handler.setFormatter(formatter)
13
30
 
31
+ logger.addHandler(console_handler)
14
32
 
15
33
  @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
16
34
  def run():
17
35
  pass
18
36
 
37
+
19
38
  @click.command()
20
- @click.option("--rules-file",
39
+ @click.option("--rules-file", type=PathArgs,
21
40
  required=True,
22
41
  help="json file containing mapping rules")
23
- @click.option("--output-dir",
42
+ @click.option("--output-dir", type=PathArgs,
24
43
  default=None,
44
+ required=True,
25
45
  help="define the output directory for OMOP-format tsv files")
26
46
  @click.option("--write-mode",
27
47
  default='w',
28
48
  type=click.Choice(['w','a']),
29
49
  help="force write-mode on output files")
30
- @click.option("--person-file",
50
+ @click.option("--person-file", type=PathArgs,
31
51
  required=True,
32
52
  help="File containing person_ids in the first column")
33
- @click.option("--omop-ddl-file",
53
+ @click.option("--omop-ddl-file", type=PathArgs,
34
54
  required=False,
35
55
  help="File containing OHDSI ddl statements for OMOP tables")
36
- @click.option("--omop-config-file",
56
+ @click.option("--omop-config-file", type=PathArgs,
37
57
  required=False,
38
58
  help="File containing additional / override json config for omop outputs")
39
59
  @click.option("--omop-version",
40
60
  required=False,
41
61
  help="Quoted string containing omop version - eg '5.3'")
42
- @click.option("--saved-person-id-file",
62
+ @click.option("--saved-person-id-file", type=PathArgs,
43
63
  default=None,
44
64
  required=False,
45
65
  help="Full path to person id file used to save person_id state and share person_ids between data sets")
@@ -47,7 +67,7 @@ def run():
47
67
  required=False,
48
68
  default='N',
49
69
  help="Use person ids as input without generating new integers")
50
- @click.option("--last-used-ids-file",
70
+ @click.option("--last-used-ids-file", type=PathArgs,
51
71
  default=None,
52
72
  required=False,
53
73
  help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
@@ -55,46 +75,108 @@ def run():
55
75
  required=False,
56
76
  default=0,
57
77
  help="Lower outcount limit for logfile output")
58
- @click.argument("input-dir",
59
- required=False,
60
- nargs=-1)
61
- def mapstream(rules_file, output_dir, write_mode,
62
- person_file, omop_ddl_file, omop_config_file,
63
- omop_version, saved_person_id_file, use_input_person_ids,
64
- last_used_ids_file, log_file_threshold, input_dir):
78
+ @click.option("--input-dir", type=PathArgs,
79
+ required=True,
80
+ multiple=True,
81
+ help="Input directories")
82
+ def mapstream(
83
+ rules_file: Path,
84
+ output_dir: Path,
85
+ write_mode,
86
+ person_file: Path,
87
+ omop_ddl_file: Path,
88
+ omop_config_file: Path,
89
+ omop_version,
90
+ saved_person_id_file: Path,
91
+ use_input_person_ids,
92
+ last_used_ids_file: Path,
93
+ log_file_threshold,
94
+ input_dir: Iterable[Path],
95
+ ):
65
96
  """
66
97
  Map to output using input streams
67
98
  """
68
- # Initialisation
99
+
100
+
101
+ # Resolve any @package paths in the arguments
102
+ resolved_paths = resolve_paths([
103
+ rules_file,
104
+ output_dir,
105
+ person_file,
106
+ omop_ddl_file,
107
+ omop_config_file,
108
+ saved_person_id_file,
109
+ last_used_ids_file,
110
+ input_dir[0] if input_dir else None # Take first element of input_dir tuple
111
+ ])
112
+
113
+ # Assign back resolved paths
114
+ [rules_file, output_dir, person_file, omop_ddl_file,
115
+ omop_config_file, saved_person_id_file, last_used_ids_file,
116
+ input_dir] = resolved_paths
117
+
118
+ # Ensure input_dir is a list of paths
119
+ if isinstance(input_dir, (Path, str)):
120
+ input_dir = [input_dir]
121
+ elif isinstance(input_dir, tuple):
122
+ input_dir = list(input_dir)
123
+ # If it's already a list, leave it as is
124
+
125
+ # Initialisation
69
126
  # - check for values in optional arguments
70
127
  # - read in configuration files
71
128
  # - check main directories for existence
72
129
  # - handle saved person ids
73
130
  # - initialise metrics
74
- print(rules_file, output_dir, write_mode,
75
- person_file, omop_ddl_file, omop_config_file,
76
- omop_version, saved_person_id_file, use_input_person_ids,
77
- last_used_ids_file, log_file_threshold, input_dir)
131
+ logger.info(
132
+ ",".join(
133
+ map(
134
+ str,
135
+ [
136
+ rules_file,
137
+ output_dir,
138
+ write_mode,
139
+ person_file,
140
+ omop_ddl_file,
141
+ omop_config_file,
142
+ omop_version,
143
+ saved_person_id_file,
144
+ use_input_person_ids,
145
+ last_used_ids_file,
146
+ log_file_threshold,
147
+ input_dir,
148
+ ],
149
+ )
150
+ )
151
+ )
78
152
 
79
153
  ## set omop filenames
80
- omop_config_file, omop_ddl_file = set_omop_filenames(omop_ddl_file, omop_config_file, omop_version)
154
+ omop_config_file, omop_ddl_file = set_omop_filenames(
155
+ omop_ddl_file, omop_config_file, omop_version
156
+ )
81
157
  ## check directories are valid
82
- check_dir_isvalid(input_dir)
83
- check_dir_isvalid(output_dir)
158
+ for idir in input_dir:
159
+ check_dir_isvalid(idir) # Input directory must exist
160
+ check_dir_isvalid(output_dir, create_if_missing=True) # Create output directory if needed
161
+
84
162
 
85
163
  saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
86
-
87
- starttime = time.time()
164
+
165
+ start_time = time.time()
88
166
  ## create OmopCDM object, which contains attributes and methods for the omop data tables.
89
167
  omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
90
168
 
91
169
  ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
92
170
  mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
93
171
  metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
94
- nowtime = time.time()
95
172
 
96
- print("--------------------------------------------------------------------------------")
97
- print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
173
+ logger.info(
174
+ "--------------------------------------------------------------------------------"
175
+ )
176
+ logger.info(
177
+ f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
178
+ )
179
+
98
180
  output_files = mappingrules.get_all_outfile_names()
99
181
 
100
182
  ## set record number
@@ -102,31 +184,30 @@ def mapstream(rules_file, output_dir, write_mode,
102
184
  record_numbers = {}
103
185
  for output_file in output_files:
104
186
  record_numbers[output_file] = 1
105
- if last_used_ids_file != None:
106
- if os.path.isfile(last_used_ids_file):
107
- record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
187
+ if (last_used_ids_file is not None) and last_used_ids_file.is_file():
188
+ record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
108
189
 
109
190
  fhd = {}
110
191
  tgtcolmaps = {}
111
192
 
112
-
113
-
114
193
  try:
115
194
  ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
116
- person_lookup, rejected_person_count = load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids)
195
+ person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
196
+ person_file, mappingrules,
197
+ use_input_person_ids)
117
198
  ## open person_ids output file
118
- with open(saved_person_id_file, mode="w") as fhpout:
199
+ with saved_person_id_file.open(mode="w") as fhpout:
119
200
  ## write the header to the file
120
201
  fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
121
202
  ##iterate through the ids and write them to the file.
122
203
  for person_id, person_assigned_id in person_lookup.items():
123
- fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
204
+ fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")
124
205
 
125
206
  ## Initialise output files (adding them to a dict), output a header for each
126
207
  ## these aren't being closed deliberately
127
208
  for tgtfile in output_files:
128
- fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
129
- if write_mode == 'w':
209
+ fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
210
+ if write_mode == "w":
130
211
  outhdr = omopcdm.get_omop_column_list(tgtfile)
131
212
  fhd[tgtfile].write("\t".join(outhdr) + "\n")
132
213
  ## maps all omop columns for each file into a dict containing the column name and the index
@@ -134,13 +215,13 @@ def mapstream(rules_file, output_dir, write_mode,
134
215
  tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
135
216
 
136
217
  except IOError as e:
137
- print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
218
+ logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
138
219
  exit()
139
220
 
140
- print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
221
+ logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")
141
222
 
142
223
  ## Compare files found in the input_dir with those expected based on mapping rules
143
- existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
224
+ existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
144
225
  rules_input_files = mappingrules.get_all_infile_names()
145
226
 
146
227
  ## Log mismatches but continue
@@ -149,7 +230,7 @@ def mapstream(rules_file, output_dir, write_mode,
149
230
  ## set up overall counts
150
231
  rejidcounts = {}
151
232
  rejdatecounts = {}
152
- print(rules_input_files)
233
+ logger.info(rules_input_files)
153
234
 
154
235
  ## set up per-input counts
155
236
  for srcfilename in rules_input_files:
@@ -162,7 +243,7 @@ def mapstream(rules_file, output_dir, write_mode,
162
243
  rejcounts = {}
163
244
  rcount = 0
164
245
 
165
- fh, csvr = open_file(input_dir[0], srcfilename)
246
+ fh, csvr = open_file(input_dir[0] / srcfilename)
166
247
  if fh is None:
167
248
  continue
168
249
 
@@ -181,21 +262,37 @@ def mapstream(rules_file, output_dir, write_mode,
181
262
  inputcolmap = omopcdm.get_column_map(hdrdata)
182
263
  pers_id_col = inputcolmap[infile_person_id_source]
183
264
  datetime_col = inputcolmap[infile_datetime_source]
184
- print("--------------------------------------------------------------------------------")
185
- print("Processing input: {0}".format(srcfilename))
265
+
266
+ logger.info(
267
+ "--------------------------------------------------------------------------------"
268
+ )
269
+ logger.info(f"Processing input: {srcfilename}")
186
270
 
187
271
  # for each input record
188
272
  for indata in csvr:
189
- key = srcfilename + "~all~all~all~"
190
- metrics.increment_key_count(key, "input_count")
273
+ metrics.increment_key_count(
274
+ source=srcfilename,
275
+ fieldname="all",
276
+ tablename="all",
277
+ concept_id="all",
278
+ additional="",
279
+ count_type="input_count"
280
+ )
191
281
  rcount += 1
192
282
  # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
193
283
  strdate = indata[datetime_col].split(" ")[0]
194
284
  fulldate = parse_date(strdate)
195
- if fulldate != None:
285
+ if fulldate is not None:
196
286
  indata[datetime_col] = fulldate
197
287
  else:
198
- metrics.increment_key_count(key, "invalid_date_fields")
288
+ metrics.increment_key_count(
289
+ source=srcfilename,
290
+ fieldname="all",
291
+ tablename="all",
292
+ concept_id="all",
293
+ additional="",
294
+ count_type="input_date_fields"
295
+ )
199
296
  continue
200
297
 
201
298
  for tgtfile in tgtfiles:
@@ -209,9 +306,9 @@ def mapstream(rules_file, output_dir, write_mode,
209
306
 
210
307
  for datacol in datacols:
211
308
  built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
212
- if built_records == True:
309
+ if built_records:
213
310
  for outrecord in outrecords:
214
- if auto_num_col != None:
311
+ if auto_num_col is not None:
215
312
  outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
216
313
  ### most of the rest of this section is actually to do with metrics
217
314
  record_numbers[tgtfile] += 1
@@ -219,70 +316,61 @@ def mapstream(rules_file, output_dir, write_mode,
219
316
  outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
220
317
  outcounts[tgtfile] += 1
221
318
 
222
- increment_key_counts(srcfilename, metrics, tgtfile, datacol, outrecord)
319
+ metrics.increment_with_datacol(
320
+ source_path=srcfilename,
321
+ target_file=tgtfile,
322
+ datacol=datacol,
323
+ out_record=outrecord
324
+ )
223
325
 
224
326
  # write the line to the file
225
327
  fhd[tgtfile].write("\t".join(outrecord) + "\n")
226
328
  else:
227
- key = srcfilename + "~all~" + tgtfile + "~all~"
228
- metrics.increment_key_count(key, "invalid_person_ids")
329
+ metrics.increment_key_count(
330
+ source=srcfilename,
331
+ fieldname="all",
332
+ tablename=tgtfile,
333
+ concept_id="all",
334
+ additional="",
335
+ count_type="invalid_person_ids",
336
+ )
229
337
  rejidcounts[srcfilename] += 1
230
338
 
231
339
  fh.close()
232
340
 
233
- nowtime= time.time()
234
- print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
341
+ logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
235
342
  for outtablename, count in outcounts.items():
236
- print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
343
+ logger.info(f"TARGET: {outtablename}: output count {str(count)}")
237
344
  # END main processing loop
238
345
 
239
- print("--------------------------------------------------------------------------------")
346
+ logger.info(
347
+ "--------------------------------------------------------------------------------"
348
+ )
349
+
240
350
  data_summary = metrics.get_mapstream_summary()
241
351
  try:
242
- dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
352
+ dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
243
353
  dsfh.write(data_summary)
244
354
  dsfh.close()
245
355
  except IOError as e:
246
- print("I/O error({0}): {1}".format(e.errno, e.strerror))
247
- print("Unable to write file")
356
+ logger.exception(f"I/O error({e.errno}): {e.strerror}")
357
+ logger.exception("Unable to write file")
358
+ raise e
248
359
 
249
360
  # END mapstream
250
- nowtime = time.time()
251
- print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
252
-
253
- def increment_key_counts(srcfilename: str, metrics: tools.metrics.Metrics, tgtfile: str, datacol: str, outrecord: list[str]) -> None:
254
- key = srcfilename + "~all~all~all~"
255
- metrics.increment_key_count(key, "output_count")
256
-
257
- key = "all~all~" + tgtfile + "~all~"
258
- metrics.increment_key_count(key, "output_count")
259
-
260
- key = srcfilename + "~all~" + tgtfile + "~all~"
261
- metrics.increment_key_count(key, "output_count")
262
-
263
- if tgtfile == "person":
264
- key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
265
- metrics.increment_key_count(key, "output_count")
266
-
267
- key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
268
- metrics.increment_key_count(key, "output_count")
269
- else:
270
- key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
271
- metrics.increment_key_count(key, "output_count")
272
-
273
- key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
274
- metrics.increment_key_count(key, "output_count")
275
-
276
- key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
277
- metrics.increment_key_count(key, "output_count")
278
-
279
- key = "all~all~all~" + outrecord[2] + "~"
280
- metrics.increment_key_count(key, "output_count")
281
- return
282
-
283
-
284
- def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
285
- tuple[bool, list[str], tools.metrics.Metrics]:
361
+ logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
362
+
363
+
364
+ def get_target_records(
365
+ tgtfilename: str,
366
+ tgtcolmap: dict[str, dict[str, int]],
367
+ rulesmap: dict[str, list[dict[str, list[str]]]],
368
+ srcfield: str,
369
+ srcdata: list[str],
370
+ srccolmap: dict[str, int],
371
+ srcfilename: str,
372
+ omopcdm: OmopCDM,
373
+ metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
286
374
  """
287
375
  build all target records for a given input field
288
376
  """
@@ -292,8 +380,8 @@ tuple[bool, list[str], tools.metrics.Metrics]:
292
380
  date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
293
381
  notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
294
382
 
295
- srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
296
- summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
383
+ srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
384
+ summarykey = srckey + "~all~"
297
385
  if valid_value(str(srcdata[srccolmap[srcfield]])):
298
386
  ## check if either or both of the srckey and summarykey are in the rules
299
387
  srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
@@ -304,7 +392,7 @@ tuple[bool, list[str], tools.metrics.Metrics]:
304
392
  if srckey in rulesmap:
305
393
  build_records = True
306
394
  dictkeys.append(srckey)
307
- if build_records == True:
395
+ if build_records:
308
396
  for dictkey in dictkeys:
309
397
  for out_data_elem in rulesmap[dictkey]:
310
398
  valid_data_elem = True
@@ -333,27 +421,47 @@ tuple[bool, list[str], tools.metrics.Metrics]:
333
421
  fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
334
422
  tgtarray[tgtcolmap[output_col_data]] = fulldate
335
423
  else:
336
- metrics.increment_key_count(summarykey, "invalid_date_fields")
424
+ metrics.increment_key_count(
425
+ source=srcfilename,
426
+ fieldname=srcfield,
427
+ tablename=tgtfilename,
428
+ concept_id="all",
429
+ additional="",
430
+ count_type="invalid_date_fields"
431
+ )
337
432
  valid_data_elem = False
338
433
  elif output_col_data in date_col_data:
339
434
  fulldate = srcdata[srccolmap[infield]]
340
435
  tgtarray[tgtcolmap[output_col_data]] = fulldate
341
436
  tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
342
- if valid_data_elem == True:
437
+ if valid_data_elem:
343
438
  tgtrecords.append(tgtarray)
344
439
  else:
345
- metrics.increment_key_count(summarykey, "invalid_source_fields")
346
-
440
+ metrics.increment_key_count(
441
+ source=srcfilename,
442
+ fieldname=srcfield,
443
+ tablename=tgtfilename,
444
+ concept_id="all",
445
+ additional="",
446
+ count_type="invalid_source_fields"
447
+ )
347
448
 
348
449
  return build_records, tgtrecords, metrics
349
450
 
451
+
350
452
  def valid_value(item):
351
453
  """
352
454
  Check if an item is non blank (null)
353
455
  """
354
456
  if item.strip() == "":
355
- return(False)
356
- return(True)
457
+ return False
458
+ return True
459
+
460
+
461
+ # DATE TESTING
462
+ # ------------
463
+ # I started by changing the get_datetime_value to be neater.
464
+ # I think it should be handled all as one thing, but I've spent too much time doing this already
357
465
 
358
466
  def valid_date_value(item):
359
467
  """
@@ -363,44 +471,33 @@ def valid_date_value(item):
363
471
  if item.strip() == "":
364
472
  return(False)
365
473
  if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
366
- #print("Bad date : {0}".format(item))
367
- return(False)
368
- return(True)
474
+ logger.warning("Bad date : {0}".format(item))
475
+ return False
476
+ return True
477
+
369
478
 
370
479
  def get_datetime_value(item):
371
480
  """
372
- Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
373
- or dd/mm/yyyy or mm/dd/yyyy
481
+ Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
482
+ or UK format (DD/MM/YYYY).
483
+ Returns a datetime object if successful, None otherwise.
374
484
  """
375
- dt = None
376
- # Does the date parse as an ISO date?
377
- try:
378
- dt = datetime.datetime.strptime(item, "%Y-%m-%d")
379
- except ValueError:
380
- pass
381
- if dt != None:
382
- return(dt)
383
-
384
- # Does the date parse as a reverse ISO date?
385
- try:
386
- dt = datetime.datetime.strptime(item, "%d-%m-%Y")
387
- except ValueError:
388
- pass
389
-
390
- if dt != None:
391
- return(dt)
392
-
393
- # Does the date parse as a UK old-style date?
394
- try:
395
- dt = datetime.datetime.strptime(item, "%d/%m/%Y")
396
- except ValueError:
397
- pass
398
-
399
- if dt != None:
400
- return(dt)
401
-
485
+ date_formats = [
486
+ "%Y-%m-%d", # ISO format (YYYY-MM-DD)
487
+ "%d-%m-%Y", # Reverse ISO format (DD-MM-YYYY)
488
+ "%d/%m/%Y", # UK old-style format (DD/MM/YYYY)
489
+ ]
490
+
491
+ for date_format in date_formats:
492
+ try:
493
+ return datetime.datetime.strptime(item, date_format)
494
+ except ValueError:
495
+ continue
496
+
497
+ # If we get here, none of the formats worked
402
498
  return None
403
499
 
500
+
404
501
  def parse_date(item):
405
502
  """
406
503
  Crude hand-coded check on date format
@@ -411,9 +508,8 @@ def parse_date(item):
411
508
  if len(datedata) != 3:
412
509
  return None
413
510
  if len(datedata[2]) == 4:
414
- return("{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0]))
415
- return("{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2]))
416
-
511
+ return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
512
+ return "-".join(datedata[:3])
417
513
 
418
514
  def valid_iso_date(item):
419
515
  """
@@ -422,9 +518,10 @@ def valid_iso_date(item):
422
518
  try:
423
519
  datetime.datetime.strptime(item, "%Y-%m-%d")
424
520
  except ValueError:
425
- return(False)
521
+ return False
522
+
523
+ return True
426
524
 
427
- return(True)
428
525
 
429
526
  def valid_reverse_iso_date(item):
430
527
  """
@@ -433,9 +530,10 @@ def valid_reverse_iso_date(item):
433
530
  try:
434
531
  datetime.datetime.strptime(item, "%d-%m-%Y")
435
532
  except ValueError:
436
- return(False)
533
+ return False
534
+
535
+ return True
437
536
 
438
- return(True)
439
537
 
440
538
  def valid_uk_date(item):
441
539
  """
@@ -444,12 +542,15 @@ def valid_uk_date(item):
444
542
  try:
445
543
  datetime.datetime.strptime(item, "%d/%m/%Y")
446
544
  except ValueError:
447
- return(False)
545
+ return False
448
546
 
449
- return(True)
547
+ return True
450
548
 
451
- def load_last_used_ids(last_used_ids_file, last_used_ids):
452
- fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
549
+
550
+ # End of date code
551
+
552
+ def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
553
+ fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
453
554
  csvr = csv.reader(fh, delimiter="\t")
454
555
 
455
556
  for last_ids_data in csvr:
@@ -458,8 +559,9 @@ def load_last_used_ids(last_used_ids_file, last_used_ids):
458
559
  fh.close()
459
560
  return last_used_ids
460
561
 
461
- def load_saved_person_ids(person_file):
462
- fh = open(person_file, mode="r", encoding="utf-8-sig")
562
+
563
+ def load_saved_person_ids(person_file: Path):
564
+ fh = person_file.open(mode="r", encoding="utf-8-sig")
463
565
  csvr = csv.reader(fh, delimiter="\t")
464
566
  last_int = 1
465
567
  person_ids = {}
@@ -475,23 +577,28 @@ def load_saved_person_ids(person_file):
475
577
  def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
476
578
  person_ids, person_number = get_person_lookup(saved_person_id_file)
477
579
 
478
- fh = open(person_file, mode="r", encoding="utf-8-sig")
580
+ fh = person_file.open(mode="r", encoding="utf-8-sig")
479
581
  csvr = csv.reader(fh, delimiter=delim)
480
582
  person_columns = {}
481
583
  person_col_in_hdr_number = 0
482
584
  reject_count = 0
483
585
 
484
586
  personhdr = next(csvr)
485
- print(personhdr)
587
+ logger.info(personhdr)
486
588
 
487
589
  # Make a dictionary of column names vs their positions
488
590
  for col in personhdr:
489
591
  person_columns[col] = person_col_in_hdr_number
490
592
  person_col_in_hdr_number += 1
491
593
 
492
- ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
493
- birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
494
- print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
594
+ ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
595
+ birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
596
+ "person"
597
+ )
598
+ logger.info(
599
+ "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
600
+ )
601
+
495
602
  ## get the column index of the PersonID from the input file
496
603
  person_col = person_columns[person_id_source]
497
604
 
@@ -516,55 +623,122 @@ def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_p
516
623
  def py():
517
624
  pass
518
625
 
519
- def check_dir_isvalid(directory: str | tuple[str, ...]) -> None:
626
+
627
+ def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
628
+ """Check if directory is valid, optionally create it if missing.
629
+
630
+ Args:
631
+ directory: Directory path as string or tuple
632
+ create_if_missing: If True, create directory if it doesn't exist
633
+ """
634
+
635
+ ## check directory has been set
636
+ if directory is None:
637
+ logger.warning("Directory not provided.")
638
+ sys.exit(1)
639
+
520
640
  ## check output dir is valid
521
- if type(directory) is tuple:
641
+ elif type(directory) is tuple:
522
642
  directory = directory[0]
523
643
 
524
- if not os.path.isdir(directory):
525
- print("Not a directory, dir {0}".format(directory))
526
- sys.exit(1)
527
644
 
528
- def set_saved_person_id_file(saved_person_id_file: str, output_dir: str) -> str:
529
- ## check if there is a saved person id file set in options - if not, check if the file exists and remove it
645
+ ## if not a directory, create it if requested (including parents. This option is for the output directory only).
646
+ if not directory.is_dir():
647
+ if create_if_missing:
648
+ try:
649
+ ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
650
+ directory.mkdir(parents = True)
651
+ logger.info(f"Created directory: {directory}")
652
+ except OSError as e:
653
+ logger.warning(f"Failed to create directory {directory}: {e}")
654
+ sys.exit(1)
655
+ else:
656
+ logger.warning(f"Not a directory, dir {directory}")
657
+ sys.exit(1)
658
+
659
+ # Handle tuple input (like input_dir)
660
+ if isinstance(directory, tuple):
661
+ if not directory: # Empty tuple
662
+ print("No directory provided")
663
+ sys.exit(1)
664
+ directory = directory[0]
665
+
666
+ # Handle string input
667
+ dir_path = str(directory)
668
+ if not os.path.isdir(dir_path):
669
+ if create_if_missing:
670
+ try:
671
+ os.makedirs(dir_path)
672
+ print(f"Created directory: {dir_path}")
673
+ except OSError as e:
674
+ print(f"Failed to create directory {dir_path}: {e}")
675
+ sys.exit(1)
676
+ else:
677
+ print(f"Not a directory, dir {dir_path}")
678
+ sys.exit(1)
679
+
680
+
681
+ def set_saved_person_id_file(
682
+ saved_person_id_file: Path | None, output_dir: Path
683
+ ) -> Path:
684
+ """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
685
+
530
686
  if saved_person_id_file is None:
531
- saved_person_id_file = output_dir + "/" + "person_ids.tsv"
532
- if os.path.exists(saved_person_id_file):
533
- os.remove(saved_person_id_file)
687
+ saved_person_id_file = output_dir / "person_ids.tsv"
688
+ if saved_person_id_file.exists():
689
+ assert not saved_person_id_file.is_dir()
690
+ saved_person_id_file.unlink()
691
+ else:
692
+ assert not saved_person_id_file.is_dir()
534
693
  return saved_person_id_file
535
694
 
536
-
537
695
  def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
538
696
  for infile in existing_input_files:
539
697
  if infile not in rules_input_files:
540
- msg = "WARNING: no mapping rules found for existing input file - {0}".format(infile)
541
- print(msg)
698
+ msg = (
699
+ "WARNING: no mapping rules found for existing input file - {0}".format(
700
+ infile
701
+ )
702
+ )
703
+ logger.warning(msg)
542
704
  for infile in rules_input_files:
543
705
  if infile not in existing_input_files:
544
706
  msg = "WARNING: no data for mapped input file - {0}".format(infile)
545
- print(msg)
707
+ logger.warning(msg)
546
708
 
547
- def open_file(directory: str, filename: str) -> tuple[IO[str], Iterator[list[str]]] | None:
548
- #def open_file(directory: str, filename: str):
709
+ def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
710
+ """opens a file and does something related to CSVs"""
549
711
  try:
550
- fh = open(directory + "/" + filename, mode="r", encoding="utf-8-sig")
712
+ fh = file_path.open(mode="r", encoding="utf-8-sig")
551
713
  csvr = csv.reader(fh)
552
714
  return fh, csvr
553
715
  except IOError as e:
554
- print("Unable to open: {0}".format(directory + "/" + filename))
555
- print("I/O error({0}): {1}".format(e.errno, e.strerror))
716
+ logger.exception("Unable to open: {0}".format(file_path))
717
+ logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
556
718
  return None
557
719
 
558
- def set_omop_filenames(omop_ddl_file: str, omop_config_file: str, omop_version: str) -> tuple[str, str]:
559
- if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
560
- omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
720
+
721
+ def set_omop_filenames(
722
+ omop_ddl_file: Path, omop_config_file: Path, omop_version: str
723
+ ) -> tuple[Path, Path]:
724
+ if (
725
+ (omop_ddl_file is None)
726
+ and (omop_config_file is None)
727
+ and (omop_version is not None)
728
+ ):
729
+ omop_config_file = (
730
+ importlib.resources.files("carrottransform") / "config/omop.json"
731
+ )
561
732
  omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
562
- omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
733
+ omop_ddl_file = (
734
+ importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
735
+ )
563
736
  return omop_config_file, omop_ddl_file
564
737
 
565
- def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
738
+
739
+ def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
566
740
  # Saved-person-file existence test, reload if found, return last used integer
567
- if os.path.isfile(saved_person_id_file):
741
+ if saved_person_id_file.is_file():
568
742
  person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
569
743
  else:
570
744
  person_lookup = {}
@@ -572,6 +746,3 @@ def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
572
746
  return person_lookup, last_used_integer
573
747
 
574
748
  run.add_command(mapstream,"mapstream")
575
-
576
- if __name__== '__main__':
577
- mapstream()
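The run.py changes above replace the trailing positional input directory with a required, repeatable --input-dir option, make --output-dir required (creating it when missing), and route console output through the logging module. As a rough sketch only (file names here are placeholders that would need to exist on disk), the reworked command can be driven through click's test runner:

from click.testing import CliRunner
from carrottransform.cli.subcommands.run import run

# Placeholder inputs; a real run needs an actual rules file, person file
# and an input directory containing the source CSVs.
result = CliRunner().invoke(run, [
    "mapstream",
    "--rules-file", "rules.json",
    "--person-file", "people.csv",
    "--output-dir", "out",
    "--omop-version", "5.3",
    "--input-dir", "inputs",
])
print(result.exit_code)
print(result.output)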
@@ -0,0 +1,21 @@
+ import click
+ from pathlib import Path
+
+
+ def PathArgs():
+ """used by the click library for CLI args that are files"""
+
+ class PathArgs(click.ParamType):
+ name = "pathlib.Path"
+
+ def convert(self, value, param, ctx):
+ try:
+ return Path(value)
+ except Exception as e:
+ self.fail(f"Invalid path: {value} ({e})", param, ctx)
+
+ return PathArgs()
+
+
+ # use this
+ PathArgs = PathArgs()
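This new tools/click.py module is what the type=PathArgs options above rely on: the module-level name ends up bound to a single click.ParamType instance whose convert() turns each command-line string into a pathlib.Path. A small illustrative sketch (the command below is made up; only the PathArgs import reflects the package):

import click
from carrottransform.tools.click import PathArgs

@click.command()
@click.option("--rules-file", type=PathArgs, required=True)
def show(rules_file):
    # rules_file arrives as a pathlib.Path, already converted by PathArgs.convert()
    click.echo(f"{type(rules_file).__name__}: {rules_file}")

if __name__ == "__main__":
    show()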
@@ -1,15 +1,41 @@
+ import json
+ import logging
  import os
  import sys
  import json
+ import importlib.resources as resources
+ from typing import List, Optional
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+

  # Function inherited from the "old" CaRROT-CDM (modfied to exit on error)
-
- def load_json(f_in):
+
+
+ def load_json(f_in: Path):
  try:
- data = json.load(open(f_in))
+ data = json.load(f_in.open())
  except Exception as err:
- print ("{0} not found. Or cannot parse as json".format(f_in))
+ logger.exception("{0} not found. Or cannot parse as json".format(f_in))
  sys.exit()

  return data

+
+ def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
+ """Resolve special path syntaxes in command line arguments."""
+ try:
+ with resources.files('carrottransform').joinpath('__init__.py') as f:
+ package_path = f.parent
+ except Exception:
+ # Fallback for development environment
+ import carrottransform
+ package_path = Path(carrottransform.__file__).resolve().parent
+
+ # Handle None values and replace @carrot with the actual package path
+ prefix = '@carrot'
+ return [
+ package_path / Path(str(arg).replace(prefix, '').lstrip('/')) if arg is not None and str(arg).startswith(prefix) else arg
+ for arg in args
+ ]
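resolve_paths() is the helper behind the "@carrot" path shorthand used by the CLI arguments: any argument whose string form starts with @carrot is re-rooted under the installed carrottransform package directory, while None values and ordinary paths pass through unchanged. A rough usage sketch (the printed location depends on where the package is installed):

from pathlib import Path
from carrottransform.tools.file_helpers import resolve_paths

resolved = resolve_paths([
    Path("@carrot/config/omop.json"),  # re-rooted under the package directory
    Path("/data/rules.json"),          # returned untouched
    None,                              # passed through as None
])
print(resolved[0])  # e.g. .../site-packages/carrottransform/config/omop.json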
@@ -3,13 +3,16 @@ import json
  import carrottransform.tools as tools
  from .omopcdm import OmopCDM

+ import logging
+ logger = logging.getLogger(__name__)
+
  class MappingRules:
  """
  self.rules_data stores the mapping rules as untransformed json, as each input file is processed rules are reorganised
  as a file-specific dictionary allowing rules to be "looked-up" depending on data content
  """

- def __init__(self, rulesfilepath, omopcdm):
+ def __init__(self, rulesfilepath: os.PathLike, omopcdm: OmopCDM):
  ## just loads the json directly
  self.rules_data = tools.load_json(rulesfilepath)
  self.omopcdm = omopcdm
@@ -80,7 +83,7 @@ class MappingRules:
  outfile = keydata[-1]
  for outfield_elem in outfield_data:
  for infield, outfield_list in outfield_elem.items():
- #print("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
+ logger.debug("{0}, {1}, {2}".format(outfile, infield, str(outfield_list)))
  for outfield in outfield_list:
  if outfield.split('~')[0] in self.omopcdm.get_omop_datetime_fields(outfile):
  datetime_source = infield
@@ -1,3 +1,95 @@
1
+
2
+ import logging
3
+ logger = logging.getLogger(__name__)
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, List
7
+
8
+ @dataclass
9
+ class DataKey:
10
+ source: str
11
+ fieldname:str
12
+ tablename:str
13
+ concept_id:str
14
+ additional:str
15
+
16
+ def __str__(self) -> str:
17
+ """
18
+ The original implementation used strings as keys, then split by `~`.
19
+ This is here in case that representation is needed somewhere
20
+ """
21
+ return f"{self.source}~{self.fieldname}~{self.tablename}~{self.concept_id}~{self.additional}"
22
+ def __hash__(self) -> int:
23
+ """
24
+ The DataKey is used as a key for a dictionary of key counts
25
+ """
26
+ return hash((self.source, self.fieldname, self.tablename, self.concept_id, self.additional))
27
+
28
+ @dataclass
29
+ class CountData:
30
+ counts: Dict[str, int] = field(default_factory=dict)
31
+
32
+ def increment(self, count_type: str):
33
+ if count_type not in self.counts:
34
+ self.counts[count_type] = 0
35
+ self.counts[count_type] += 1
36
+
37
+ def get_count(self, count_type: str, default: int=0):
38
+ return self.counts.get(count_type, default)
39
+
40
+ @dataclass
41
+ class MapstreamSummaryRow:
42
+ """Represents a single row in the mapstream summary"""
43
+ dataset_name: str
44
+ source: str
45
+ fieldname: str
46
+ tablename: str
47
+ concept_id: str
48
+ additional: str
49
+ input_count: int = 0
50
+ invalid_person_ids: int = 0
51
+ invalid_date_fields: int = 0
52
+ invalid_source_fields: int = 0
53
+ output_count: int = 0
54
+
55
+ def to_tsv_row(self) -> str:
56
+ """Convert the row to a tab-separated string"""
57
+ row_list = [str(col) for col in [
58
+ self.dataset_name,
59
+ self.source,
60
+ self.fieldname,
61
+ self.tablename,
62
+ self.concept_id,
63
+ self.additional,
64
+ self.input_count,
65
+ self.invalid_person_ids,
66
+ self.invalid_date_fields,
67
+ self.invalid_source_fields,
68
+ self.output_count
69
+ ]]
70
+ # If python gets updated, you can move the row_str expression into the f-string
71
+ row_str = '\t'.join(row_list)
72
+ return f"{row_str}\n"
73
+
74
+ @classmethod
75
+ def get_header(cls) -> str:
76
+ """Return the TSV header row"""
77
+ header = [
78
+ "dsname",
79
+ "source",
80
+ "source_field",
81
+ "target",
82
+ "concept_id",
83
+ "additional",
84
+ "incount",
85
+ "invalid_persid",
86
+ "invalid_date",
87
+ "invalid_source",
88
+ "outcount"
89
+ ]
90
+ header_str = '\t'.join(header)
91
+ return f"{header_str}\n"
92
+
1
93
  class Metrics():
2
94
  """
3
95
  Capture metrics for output to a summary tsv file, record counts at multiple levels
@@ -58,21 +150,87 @@ class Metrics():
58
150
  self.datasummary[dkey][counttype] = 0
59
151
  self.datasummary[dkey][counttype] += int(count_block[counttype])
60
152
 
61
- def increment_key_count(self, dkey, count_type):
62
- """
63
- Intended to work with the mapstream functions
64
- """
153
+ def increment_key_count(self, source, fieldname, tablename, concept_id, additional, count_type):
154
+ dkey = DataKey(source, fieldname, tablename, concept_id, additional)
155
+
65
156
  if dkey not in self.datasummary:
66
- self.datasummary[dkey] = {}
67
- if count_type not in self.datasummary[dkey]:
68
- self.datasummary[dkey][count_type] = 0
69
- self.datasummary[dkey][count_type] += 1
157
+ self.datasummary[dkey] = CountData()
158
+
159
+ self.datasummary[dkey].increment(count_type)
160
+
161
+ def increment_with_datacol(
162
+ self,
163
+ source_path: str,
164
+ target_file: str,
165
+ datacol: str,
166
+ out_record: List[str],
167
+ ) -> None:
168
+ #Are the parameters for DataKeys hierarchical?
169
+ #If so, a nested structure where a Source contains n Fields etc. and each has a method to sum its children would be better
170
+ #But I don't know if that's the desired behaviour
171
+
172
+ #A lot of these increment the same thing, so I have defined `increment_this`
173
+ def increment_this(
174
+ fieldname: str,
175
+ concept_id: str,
176
+ additional = "",
177
+ ) -> None:
178
+ self.increment_key_count(
179
+ source=source_path,
180
+ fieldname=fieldname,
181
+ tablename=target_file,
182
+ concept_id=concept_id,
183
+ additional=additional,
184
+ count_type="output_count"
185
+ )
186
+ self.increment_key_count(
187
+ source=source_path,
188
+ fieldname="all",
189
+ tablename="all",
190
+ concept_id="all",
191
+ additional="",
192
+ count_type="output_count"
193
+ )
194
+
195
+ self.increment_key_count(
196
+ source="all",
197
+ fieldname="all",
198
+ tablename=target_file,
199
+ concept_id="all",
200
+ additional="",
201
+ count_type="output_count"
202
+ )
203
+ increment_this(fieldname="all", concept_id="all")
204
+
205
+ if target_file == "person":
206
+ increment_this(fieldname="all", concept_id=out_record[1])
207
+ increment_this(fieldname="all", concept_id=out_record[1], additional=out_record[2])
208
+ else:
209
+ increment_this(fieldname=datacol, concept_id=out_record[2])
210
+ increment_this(fieldname="all", concept_id=out_record[2])
211
+ self.increment_key_count(
212
+ source="all",
213
+ fieldname="all",
214
+ tablename=target_file,
215
+ concept_id=out_record[2],
216
+ additional="",
217
+ count_type="output_count"
218
+ )
219
+ self.increment_key_count(
220
+ source="all",
221
+ fieldname="all",
222
+ tablename="all",
223
+ concept_id=out_record[2],
224
+ additional="",
225
+ count_type="output_count"
226
+ )
227
+
70
228
 
71
229
  def get_summary(self):
72
230
  summary_str = "source\ttablename\tname\tcolumn name\tbefore\tafter content check\tpct reject content check\tafter date format check\tpct reject date format\n"
73
231
 
74
232
  for dkey in self.datasummary:
75
- #print(dkey)
233
+ logger.debug(dkey)
76
234
  source, tablename, name, colname = dkey.split('.')
77
235
  before_count = int(self.datasummary[dkey]["before"])
78
236
  after_count = int(self.datasummary[dkey]["after"])
@@ -90,40 +248,54 @@ class Metrics():
90
248
  def get_data_summary(self):
91
249
  return self.datasummary
92
250
 
93
- def get_mapstream_summary(self):
94
- summary_str = "dsname\tsource\tsource_field\ttarget\tconcept_id\tadditional\tincount\tinvalid_persid\tinvalid_date\tinvalid_source\toutcount\n"
95
-
96
- for dkey in sorted(self.datasummary):
97
- try:
98
- source, fieldname, tablename, concept_id, additional = dkey.split('~')
99
- except ValueError:
100
- print("get_mapstream_summary - ValueError: {0}".format(dkey))
101
- break
102
-
103
- source = self.get_prefix(source)
104
- dvalue = self.datasummary[dkey]
105
-
106
- input_count = "0"
107
- if "input_count" in dvalue:
108
- input_count = str(dvalue["input_count"])
251
+ def get_mapstream_summary_rows(self) -> List[MapstreamSummaryRow]:
252
+ """
253
+ Creates a list of MapstreamSummaryRow from the datasummary
254
+ """
255
+ rows = []
109
256
 
110
- invalid_person_ids = "0"
111
- if "invalid_person_ids" in dvalue:
112
- invalid_person_ids = str(dvalue["invalid_person_ids"])
257
+ for d_key in sorted(self.datasummary.keys(), key=str):
258
+ source = self.get_prefix(d_key.source)
259
+ count_data = self.datasummary[d_key]
113
260
 
114
- invalid_source_fields = "0"
115
- if "invalid_source_fields" in dvalue:
116
- invalid_source_fields = str(dvalue["invalid_source_fields"])
261
+ row = MapstreamSummaryRow(
262
+ dataset_name=self.dataset_name,
263
+ source=source,
264
+ fieldname=d_key.fieldname,
265
+ tablename=d_key.tablename,
266
+ concept_id=d_key.concept_id,
267
+ additional=d_key.additional,
268
+ input_count=count_data.get_count("input_count"),
269
+ invalid_person_ids=count_data.get_count("invalid_person_ids"),
270
+ invalid_date_fields=count_data.get_count("invalid_date_fields"),
271
+ invalid_source_fields=count_data.get_count("invalid_source_fields"),
272
+ output_count=count_data.get_count("output_count")
273
+ )
117
274
 
118
- invalid_date_fields = "0"
119
- if "invalid_date_fields" in dvalue:
120
- invalid_date_fields = str(dvalue["invalid_date_fields"])
275
+ if row.output_count >= self.log_threshold:
276
+ rows.append(row)
277
+ return rows
121
278
 
122
- output_count = "0"
123
- if "output_count" in dvalue:
124
- output_count = str(dvalue["output_count"])
125
279
 
126
- if (int(output_count) >= self.log_threshold):
127
- summary_str += self.dataset_name + "\t" + source + "\t" + fieldname + "\t" + tablename + "\t" + concept_id + "\t" + additional +"\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"
280
+ def get_mapstream_summary(self) -> str:
281
+ """
282
+ Makes a TSV string of the mapstream summary
283
+ """
284
+ summary_rows = self.get_mapstream_summary_rows()
285
+ result = MapstreamSummaryRow.get_header()
286
+
287
+ for row in summary_rows:
288
+ result += row.to_tsv_row()
289
+
290
+ return result
128
291
 
129
- return summary_str
292
+ def get_mapstream_summary_dict(self) -> Dict:
293
+ """
294
+ Makes a dict of the mapstream summary
295
+ """
296
+ rows = self.get_mapstream_summary_rows()
297
+ return {
298
+ "dataset": self.dataset_name,
299
+ "threshold": self.log_threshold,
300
+ "rows": [vars(row) for row in rows]
301
+ }
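The rewritten metrics module swaps the old "~"-delimited string keys (for example "demo.csv~all~all~all~") for a DataKey dataclass with per-key CountData, so increment_key_count now takes the key parts as named arguments. A minimal sketch of the new call shape, with a made-up dataset and source name (the constructor arguments mirror the Metrics(...) call in run.py: dataset name, then log threshold):

from carrottransform.tools.metrics import Metrics

metrics = Metrics("demo_dataset", 0)

metrics.increment_key_count(
    source="demo.csv",
    fieldname="all",
    tablename="all",
    concept_id="all",
    additional="",
    count_type="input_count",
)

# TSV text starting with the header from MapstreamSummaryRow.get_header()
print(metrics.get_mapstream_summary())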
@@ -1,8 +1,13 @@
  import carrottransform.tools as tools
  import json
+ import logging
  import re
  import sys

+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
  class OmopCDM:
  """
  Load and parse OMOP DDL data, to make an in-memory json CDM
@@ -29,11 +34,11 @@ class OmopCDM:
  self.auto_number_field = self.get_columns("auto_number_field")


- def load_ddl(self, omopddl):
+ def load_ddl(self, omopddl: Path):
  try:
- fp = open(omopddl, "r")
+ fp = omopddl.open("r")
  except Exception as err:
- print("OMOP ddl file ({0}) not found".format(omopddl))
+ logger.exception("OMOP ddl file ({0}) not found".format(omopddl))
  sys.exit()

  return(self.process_ddl(fp))
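Finally, omopcdm.py now expects Path-like arguments for the DDL and configuration files, matching the values that run.py's set_omop_filenames() builds via importlib.resources. A hedged sketch of constructing the object directly against the bundled 5.3 DDL and omop.json listed in the RECORD above (the "person" table lookup is illustrative):

import importlib.resources
from carrottransform.tools.omopcdm import OmopCDM

pkg = importlib.resources.files("carrottransform")
omopcdm = OmopCDM(
    pkg / "config" / "OMOPCDM_postgresql_5.3_ddl.sql",
    pkg / "config" / "omop.json",
)
print(omopcdm.get_omop_column_list("person"))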