carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of carrot-transform might be problematic.

Files changed (33)
  1. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +302 -443
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/click.py +21 -0
  16. carrottransform/tools/concept_helpers.py +61 -0
  17. carrottransform/tools/core.py +163 -0
  18. carrottransform/tools/date_helpers.py +79 -0
  19. carrottransform/tools/file_helpers.py +177 -7
  20. carrottransform/tools/logger.py +19 -0
  21. carrottransform/tools/mapping_types.py +32 -0
  22. carrottransform/tools/mappingrules.py +298 -32
  23. carrottransform/tools/metrics.py +274 -49
  24. carrottransform/tools/omopcdm.py +42 -32
  25. carrottransform/tools/orchestrator.py +381 -0
  26. carrottransform/tools/person_helpers.py +126 -0
  27. carrottransform/tools/record_builder.py +413 -0
  28. carrottransform/tools/stream_helpers.py +71 -0
  29. carrottransform/tools/types.py +71 -0
  30. carrottransform/tools/validation.py +62 -0
  31. carrot_transform-0.3.4.dist-info/RECORD +0 -24
  32. carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
  33. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
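
The most user-visible change in this release is the `mapstream` command-line interface in carrottransform/cli/subcommands/run.py (full diff below): the trailing positional INPUT_DIR argument becomes a required --input-dir option, --output-dir becomes required, and the path options are parsed with the new PathArgs type and then passed through resolve_paths (which, per the new code's comment, resolves @package paths). A minimal sketch of driving the reworked command from Python via click's test runner; the file paths are placeholders and carrot-transform 0.4.0 must be installed for the import to resolve:

    from click.testing import CliRunner

    from carrottransform.cli.subcommands.run import mapstream

    runner = CliRunner()
    result = runner.invoke(
        mapstream,
        [
            "--rules-file", "rules/v2.json",            # JSON mapping rules
            "--person-file", "input/demographics.csv",  # person_ids in the first column
            "--output-dir", "output",                   # required in 0.4.0 (optional in 0.3.4)
            "--input-dir", "input",                     # was a positional argument in 0.3.4
        ],
    )
    print(result.exit_code)
    print(result.output)

The full diff of carrottransform/cli/subcommands/run.py follows.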
@@ -1,100 +1,217 @@
- import csv
- import os, time
- import datetime
- import fnmatch
  import sys
+ import time
+ from pathlib import Path
  import click
- import json
- import importlib.resources
- import carrottransform
- import carrottransform.tools as tools
- from carrottransform.tools.omopcdm import OmopCDM
- from typing import Iterator, IO
 
+ import carrottransform.tools as tools
+ from carrottransform.tools.click import PathArgs
+ from carrottransform.tools.file_helpers import (
+     check_dir_isvalid,
+     check_files_in_rules_exist,
+     open_file,
+     resolve_paths,
+     set_omop_filenames,
+ )
+ from carrottransform.tools.logger import logger_setup
+ from carrottransform.tools.core import (
+     get_target_records,
+ )
+ from carrottransform.tools.date_helpers import normalise_to8601
+ from carrottransform.tools.person_helpers import (
+     load_last_used_ids,
+     load_person_ids,
+     set_saved_person_id_file,
+ )
+ from carrottransform.tools.args import person_rules_check, OnlyOnePersonInputAllowed
+
+ logger = logger_setup()
 
- @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
- def run():
-     pass
 
  @click.command()
- @click.option("--rules-file",
-               required=True,
-               help="json file containing mapping rules")
- @click.option("--output-dir",
-               default=None,
-               help="define the output directory for OMOP-format tsv files")
- @click.option("--write-mode",
-               default='w',
-               type=click.Choice(['w','a']),
-               help="force write-mode on output files")
- @click.option("--person-file",
-               required=True,
-               help="File containing person_ids in the first column")
- @click.option("--omop-ddl-file",
-               required=False,
-               help="File containing OHDSI ddl statements for OMOP tables")
- @click.option("--omop-config-file",
-               required=False,
-               help="File containing additional / override json config for omop outputs")
- @click.option("--omop-version",
-               required=False,
-               help="Quoted string containing omop version - eg '5.3'")
- @click.option("--saved-person-id-file",
-               default=None,
-               required=False,
-               help="Full path to person id file used to save person_id state and share person_ids between data sets")
- @click.option("--use-input-person-ids",
-               required=False,
-               default='N',
-               help="Use person ids as input without generating new integers")
- @click.option("--last-used-ids-file",
-               default=None,
-               required=False,
-               help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
- @click.option("--log-file-threshold",
-               required=False,
-               default=0,
-               help="Lower outcount limit for logfile output")
- @click.argument("input-dir",
-                 required=False,
-                 nargs=-1)
- def mapstream(rules_file, output_dir, write_mode,
-               person_file, omop_ddl_file, omop_config_file,
-               omop_version, saved_person_id_file, use_input_person_ids,
-               last_used_ids_file, log_file_threshold, input_dir):
+ @click.option(
+     "--rules-file",
+     type=PathArgs,
+     required=True,
+     help="json file containing mapping rules",
+ )
+ @click.option(
+     "--output-dir",
+     type=PathArgs,
+     default=None,
+     required=True,
+     help="define the output directory for OMOP-format tsv files",
+ )
+ @click.option(
+     "--write-mode",
+     default="w",
+     type=click.Choice(["w", "a"]),
+     help="force write-mode on output files",
+ )
+ @click.option(
+     "--person-file",
+     type=PathArgs,
+     required=True,
+     help="File containing person_ids in the first column",
+ )
+ @click.option(
+     "--omop-ddl-file",
+     type=PathArgs,
+     required=False,
+     help="File containing OHDSI ddl statements for OMOP tables",
+ )
+ @click.option(
+     "--omop-config-file",
+     type=PathArgs,
+     required=False,
+     help="File containing additional / override json config for omop outputs",
+ )
+ @click.option(
+     "--omop-version",
+     required=False,
+     help="Quoted string containing omop version - eg '5.3'",
+ )
+ @click.option(
+     "--saved-person-id-file",
+     type=PathArgs,
+     default=None,
+     required=False,
+     help="Full path to person id file used to save person_id state and share person_ids between data sets",
+ )
+ @click.option(
+     "--use-input-person-ids",
+     required=False,
+     default="N",
+     help="Use person ids as input without generating new integers",
+ )
+ @click.option(
+     "--last-used-ids-file",
+     type=PathArgs,
+     default=None,
+     required=False,
+     help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer",
+ )
+ @click.option(
+     "--log-file-threshold",
+     required=False,
+     default=0,
+     help="Lower outcount limit for logfile output",
+ )
+ @click.option("--input-dir", type=PathArgs, required=True, help="Input directories")
+ def mapstream(
+     rules_file: Path,
+     output_dir: Path,
+     write_mode,
+     person_file: Path,
+     omop_ddl_file: Path,
+     omop_config_file: Path,
+     omop_version,
+     saved_person_id_file: Path,
+     use_input_person_ids,
+     last_used_ids_file: Path,
+     log_file_threshold,
+     input_dir: Path,
+ ):
      """
      Map to output using input streams
      """
-     # Initialisation
+
+     # Resolve any @package paths in the arguments
+     [
+         rules_file,
+         output_dir,
+         person_file,
+         omop_ddl_file,
+         omop_config_file,
+         saved_person_id_file,
+         last_used_ids_file,
+         input_dir,
+     ] = resolve_paths(
+         [
+             rules_file,
+             output_dir,
+             person_file,
+             omop_ddl_file,
+             omop_config_file,
+             saved_person_id_file,
+             last_used_ids_file,
+             input_dir,
+         ]
+     )
+
+     # Initialisation
      # - check for values in optional arguments
      # - read in configuration files
      # - check main directories for existence
      # - handle saved person ids
      # - initialise metrics
-     print(rules_file, output_dir, write_mode,
-           person_file, omop_ddl_file, omop_config_file,
-           omop_version, saved_person_id_file, use_input_person_ids,
-           last_used_ids_file, log_file_threshold, input_dir)
+     logger.info(
+         ",".join(
+             map(
+                 str,
+                 [
+                     rules_file,
+                     output_dir,
+                     write_mode,
+                     person_file,
+                     omop_ddl_file,
+                     omop_config_file,
+                     omop_version,
+                     saved_person_id_file,
+                     use_input_person_ids,
+                     last_used_ids_file,
+                     log_file_threshold,
+                     input_dir,
+                 ],
+             )
+         )
+     )
+
+     # check on the rules file
+     if (rules_file is None) or (not rules_file.is_file()):
+         logger.exception(f"rules file was set to `{rules_file=}` and is missing")
+         sys.exit(-1)
 
      ## set omop filenames
-     omop_config_file, omop_ddl_file = set_omop_filenames(omop_ddl_file, omop_config_file, omop_version)
+     omop_config_file, omop_ddl_file = set_omop_filenames(
+         omop_ddl_file, omop_config_file, omop_version
+     )
      ## check directories are valid
-     check_dir_isvalid(input_dir)
-     check_dir_isvalid(output_dir)
+     check_dir_isvalid(input_dir) # Input directory must exist - we need the files in it
+     check_dir_isvalid(
+         output_dir, create_if_missing=True
+     ) # Create output directory if needed
 
      saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
-
-     starttime = time.time()
+
+     ## check on the person_file_rules
+     try:
+         person_rules_check(rules_file=rules_file, person_file=person_file)
+     except OnlyOnePersonInputAllowed as e:
+         inputs = list(sorted(list(e._inputs)))
+
+         logger.error(
+             f"Person properties were mapped from ({inputs}) but can only come from the person file {person_file.name=}"
+         )
+         sys.exit(-1)
+     except Exception as e:
+         logger.exception(f"person_file_rules check failed: {e}")
+         sys.exit(-1)
+
+     start_time = time.time()
      ## create OmopCDM object, which contains attributes and methods for the omop data tables.
      omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
 
      ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
      mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
      metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
-     nowtime = time.time()
 
-     print("--------------------------------------------------------------------------------")
-     print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+     logger.info(
+         "--------------------------------------------------------------------------------"
+     )
+     logger.info(
+         f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
+     )
+
      output_files = mappingrules.get_all_outfile_names()
 
      ## set record number
@@ -102,31 +219,33 @@ def mapstream(rules_file, output_dir, write_mode,
      record_numbers = {}
      for output_file in output_files:
          record_numbers[output_file] = 1
-     if last_used_ids_file != None:
-         if os.path.isfile(last_used_ids_file):
-             record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+     if (last_used_ids_file is not None) and last_used_ids_file.is_file():
+         record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
 
      fhd = {}
      tgtcolmaps = {}
 
-
-
      try:
          ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
-         person_lookup, rejected_person_count = load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids)
+         person_lookup, rejected_person_count = load_person_ids(
+             saved_person_id_file, person_file, mappingrules, use_input_person_ids
+         )
+
          ## open person_ids output file
-         with open(saved_person_id_file, mode="w") as fhpout:
+         with saved_person_id_file.open(mode="w") as fhpout:
              ## write the header to the file
              fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
              ##iterate through the ids and write them to the file.
              for person_id, person_assigned_id in person_lookup.items():
-                 fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+                 fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")
 
          ## Initialise output files (adding them to a dict), output a header for each
          ## these aren't being closed deliberately
          for tgtfile in output_files:
-             fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
-             if write_mode == 'w':
+             fhd[tgtfile] = (
+                 (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+             )
+             if write_mode == "w":
                  outhdr = omopcdm.get_omop_column_list(tgtfile)
                  fhd[tgtfile].write("\t".join(outhdr) + "\n")
              ## maps all omop columns for each file into a dict containing the column name and the index
@@ -134,13 +253,15 @@
              tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
 
      except IOError as e:
-         print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+         logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
          exit()
 
-     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+     logger.info(
+         f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
+     )
 
      ## Compare files found in the input_dir with those expected based on mapping rules
-     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+     existing_input_files = [f.name for f in input_dir.glob("*.csv")]
      rules_input_files = mappingrules.get_all_infile_names()
 
      ## Log mismatches but continue
@@ -149,7 +270,7 @@
      ## set up overall counts
      rejidcounts = {}
      rejdatecounts = {}
-     print(rules_input_files)
+     logger.info(rules_input_files)
 
      ## set up per-input counts
      for srcfilename in rules_input_files:
@@ -158,44 +279,64 @@
 
      ## main processing loop, for each input file
      for srcfilename in rules_input_files:
-         outcounts = {}
-         rejcounts = {}
          rcount = 0
 
-         fh, csvr = open_file(input_dir[0], srcfilename)
-         if fh is None:
-             continue
-
+         fhcsvr = open_file(input_dir / srcfilename)
+         if fhcsvr is None: # check if it's none before unpacking
+             raise Exception(f"Couldn't find file {srcfilename} in {input_dir}")
+         fh, csvr = fhcsvr # unpack now because we can't unpack none
 
          ## create dict for input file, giving the data and output file
          tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
-         infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+         infile_datetime_source, infile_person_id_source = (
+             mappingrules.get_infile_date_person_id(srcfilename)
+         )
+
+         outcounts = {}
+         rejcounts = {}
         for tgtfile in tgtfiles:
             outcounts[tgtfile] = 0
             rejcounts[tgtfile] = 0
+
         datacolsall = []
-         hdrdata = next(csvr)
+         csv_column_headers = next(csvr)
         dflist = mappingrules.get_infile_data_fields(srcfilename)
-         for colname in hdrdata:
+         for colname in csv_column_headers:
             datacolsall.append(colname)
-         inputcolmap = omopcdm.get_column_map(hdrdata)
+         inputcolmap = omopcdm.get_column_map(csv_column_headers)
         pers_id_col = inputcolmap[infile_person_id_source]
         datetime_col = inputcolmap[infile_datetime_source]
-         print("--------------------------------------------------------------------------------")
-         print("Processing input: {0}".format(srcfilename))
+
+         logger.info(
+             "--------------------------------------------------------------------------------"
+         )
+         logger.info(f"Processing input: {srcfilename}")
 
         # for each input record
         for indata in csvr:
-             key = srcfilename + "~all~all~all~"
-             metrics.increment_key_count(key, "input_count")
+             metrics.increment_key_count(
+                 source=srcfilename,
+                 fieldname="all",
+                 tablename="all",
+                 concept_id="all",
+                 additional="",
+                 count_type="input_count",
+             )
             rcount += 1
-             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
-             strdate = indata[datetime_col].split(" ")[0]
-             fulldate = parse_date(strdate)
-             if fulldate != None:
+
+             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD HH:MM:SS
+             fulldate = normalise_to8601(indata[datetime_col])
+             if fulldate is not None:
                 indata[datetime_col] = fulldate
             else:
-                 metrics.increment_key_count(key, "invalid_date_fields")
+                 metrics.increment_key_count(
+                     source=srcfilename,
+                     fieldname="all",
+                     tablename="all",
+                     concept_id="all",
+                     additional="",
+                     count_type="input_date_fields",
+                 )
                 continue
 
             for tgtfile in tgtfiles:
@@ -208,370 +349,88 @@ def mapstream(rules_file, output_dir, write_mode,
                 datacols = dflist[tgtfile]
 
                 for datacol in datacols:
-                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
-                     if built_records == True:
+                     built_records, outrecords, metrics = get_target_records(
+                         tgtfile,
+                         tgtcolmap,
+                         src_to_tgt,
+                         datacol,
+                         indata,
+                         inputcolmap,
+                         srcfilename,
+                         omopcdm,
+                         metrics,
+                     )
+
+                     if built_records:
                         for outrecord in outrecords:
-                             if auto_num_col != None:
-                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                             if auto_num_col is not None:
+                                 outrecord[tgtcolmap[auto_num_col]] = str(
+                                     record_numbers[tgtfile]
+                                 )
                             ### most of the rest of this section is actually to do with metrics
                             record_numbers[tgtfile] += 1
+
                             if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
-                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[
+                                     outrecord[tgtcolmap[pers_id_col]]
+                                 ]
                                 outcounts[tgtfile] += 1
 
-                                 increment_key_counts(srcfilename, metrics, tgtfile, datacol, outrecord)
+                                 metrics.increment_with_datacol(
+                                     source_path=srcfilename,
+                                     target_file=tgtfile,
+                                     datacol=datacol,
+                                     out_record=outrecord,
+                                 )
 
                                 # write the line to the file
                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
                             else:
-                                 key = srcfilename + "~all~" + tgtfile + "~all~"
-                                 metrics.increment_key_count(key, "invalid_person_ids")
+                                 metrics.increment_key_count(
+                                     source=srcfilename,
+                                     fieldname="all",
+                                     tablename=tgtfile,
+                                     concept_id="all",
+                                     additional="",
+                                     count_type="invalid_person_ids",
+                                 )
                                 rejidcounts[srcfilename] += 1
 
+                 if tgtfile == "person":
+                     break
+
         fh.close()
 
-         nowtime= time.time()
-         print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+         logger.info(
+             f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs"
+         )
         for outtablename, count in outcounts.items():
-             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+             logger.info(f"TARGET: {outtablename}: output count {str(count)}")
     # END main processing loop
 
-     print("--------------------------------------------------------------------------------")
+     logger.info(
+         "--------------------------------------------------------------------------------"
+     )
+
     data_summary = metrics.get_mapstream_summary()
     try:
-         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+         dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
         dsfh.write(data_summary)
         dsfh.close()
     except IOError as e:
-         print("I/O error({0}): {1}".format(e.errno, e.strerror))
-         print("Unable to write file")
+         logger.exception(f"I/O error({e.errno}): {e.strerror}")
+         logger.exception("Unable to write file")
+         raise e
 
     # END mapstream
-     nowtime = time.time()
-     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
-
- def increment_key_counts(srcfilename: str, metrics: tools.metrics.Metrics, tgtfile: str, datacol: str, outrecord: list[str]) -> None:
-     key = srcfilename + "~all~all~all~"
-     metrics.increment_key_count(key, "output_count")
-
-     key = "all~all~" + tgtfile + "~all~"
-     metrics.increment_key_count(key, "output_count")
-
-     key = srcfilename + "~all~" + tgtfile + "~all~"
-     metrics.increment_key_count(key, "output_count")
-
-     if tgtfile == "person":
-         key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
-         metrics.increment_key_count(key, "output_count")
-
-         key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
-         metrics.increment_key_count(key, "output_count")
-     else:
-         key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
-         metrics.increment_key_count(key, "output_count")
-
-         key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
-         metrics.increment_key_count(key, "output_count")
-
-         key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
-         metrics.increment_key_count(key, "output_count")
-
-         key = "all~all~all~" + outrecord[2] + "~"
-         metrics.increment_key_count(key, "output_count")
-     return
-
-
- def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
-     tuple[bool, list[str], tools.metrics.Metrics]:
-     """
-     build all target records for a given input field
-     """
-     build_records = False
-     tgtrecords = []
-     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
-     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
-     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
-
-     srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
-     summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
-     if valid_value(str(srcdata[srccolmap[srcfield]])):
-         ## check if either or both of the srckey and summarykey are in the rules
-         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
-         dictkeys = []
-         if srcfullkey in rulesmap:
-             build_records = True
-             dictkeys.append(srcfullkey)
-         if srckey in rulesmap:
-             build_records = True
-             dictkeys.append(srckey)
-         if build_records == True:
-             for dictkey in dictkeys:
-                 for out_data_elem in rulesmap[dictkey]:
-                     valid_data_elem = True
-                     ## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
-                     tgtarray = ['']*len(tgtcolmap)
-                     for req_integer in notnull_numeric_fields:
-                         tgtarray[tgtcolmap[req_integer]] = "0"
-                     for infield, outfield_list in out_data_elem.items():
-                         for output_col_data in outfield_list:
-                             if "~" in output_col_data:
-                                 outcol, term = output_col_data.split("~")
-                                 tgtarray[tgtcolmap[outcol]] = term
-                             else:
-                                 tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
-                             if output_col_data in date_component_data:
-                                 ## parse the date and store it in the proper format
-                                 strdate = srcdata[srccolmap[infield]].split(" ")[0]
-                                 dt = get_datetime_value(strdate)
-                                 if dt != None:
-                                     year_field = date_component_data[output_col_data]["year"]
-                                     month_field = date_component_data[output_col_data]["month"]
-                                     day_field = date_component_data[output_col_data]["day"]
-                                     tgtarray[tgtcolmap[year_field]] = str(dt.year)
-                                     tgtarray[tgtcolmap[month_field]] = str(dt.month)
-                                     tgtarray[tgtcolmap[day_field]] = str(dt.day)
-                                     fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
-                                     tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                 else:
-                                     metrics.increment_key_count(summarykey, "invalid_date_fields")
-                                     valid_data_elem = False
-                             elif output_col_data in date_col_data:
-                                 fulldate = srcdata[srccolmap[infield]]
-                                 tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                 tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                     if valid_data_elem == True:
-                         tgtrecords.append(tgtarray)
-     else:
-         metrics.increment_key_count(summarykey, "invalid_source_fields")
-
-
-     return build_records, tgtrecords, metrics
-
- def valid_value(item):
-     """
-     Check if an item is non blank (null)
-     """
-     if item.strip() == "":
-         return(False)
-     return(True)
-
- def valid_date_value(item):
-     """
-     Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
-     or dd/mm/yyyy or mm/dd/yyyy
-     """
-     if item.strip() == "":
-         return(False)
-     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-         #print("Bad date : {0}".format(item))
-         return(False)
-     return(True)
-
- def get_datetime_value(item):
-     """
-     Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
-     or dd/mm/yyyy or mm/dd/yyyy
-     """
-     dt = None
-     # Does the date parse as an ISO date?
-     try:
-         dt = datetime.datetime.strptime(item, "%Y-%m-%d")
-     except ValueError:
-         pass
-     if dt != None:
-         return(dt)
-
-     # Does the date parse as a reverse ISO date?
-     try:
-         dt = datetime.datetime.strptime(item, "%d-%m-%Y")
-     except ValueError:
-         pass
-
-     if dt != None:
-         return(dt)
-
-     # Does the date parse as a UK old-style date?
-     try:
-         dt = datetime.datetime.strptime(item, "%d/%m/%Y")
-     except ValueError:
-         pass
-
-     if dt != None:
-         return(dt)
-
-     return None
-
- def parse_date(item):
-     """
-     Crude hand-coded check on date format
-     """
-     datedata = item.split("-")
-     if len(datedata) != 3:
-         datedata = item.split("/")
-     if len(datedata) != 3:
-         return None
-     if len(datedata[2]) == 4:
-         return("{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0]))
-     return("{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2]))
-
-
- def valid_iso_date(item):
-     """
-     Check if a date item is non null and parses as ISO (YYYY-MM-DD)
-     """
-     try:
-         datetime.datetime.strptime(item, "%Y-%m-%d")
-     except ValueError:
-         return(False)
-
-     return(True)
-
- def valid_reverse_iso_date(item):
-     """
-     Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
-     """
-     try:
-         datetime.datetime.strptime(item, "%d-%m-%Y")
-     except ValueError:
-         return(False)
+     logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
 
-     return(True)
 
- def valid_uk_date(item):
-     """
-     Check if a date item is non null and parses as UK format (DD/MM/YYYY)
-     """
-     try:
-         datetime.datetime.strptime(item, "%d/%m/%Y")
-     except ValueError:
-         return(False)
-
-     return(True)
-
- def load_last_used_ids(last_used_ids_file, last_used_ids):
-     fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter="\t")
-
-     for last_ids_data in csvr:
-         last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
-
-     fh.close()
-     return last_used_ids
-
- def load_saved_person_ids(person_file):
-     fh = open(person_file, mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter="\t")
-     last_int = 1
-     person_ids = {}
-
-     next(csvr)
-     for persondata in csvr:
-         person_ids[persondata[0]] = persondata[1]
-         last_int += 1
-
-     fh.close()
-     return person_ids, last_int
-
- def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
-     person_ids, person_number = get_person_lookup(saved_person_id_file)
-
-     fh = open(person_file, mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter=delim)
-     person_columns = {}
-     person_col_in_hdr_number = 0
-     reject_count = 0
-
-     personhdr = next(csvr)
-     print(personhdr)
-
-     # Make a dictionary of column names vs their positions
-     for col in personhdr:
-         person_columns[col] = person_col_in_hdr_number
-         person_col_in_hdr_number += 1
-
-     ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
-     birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
-     print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
-     ## get the column index of the PersonID from the input file
-     person_col = person_columns[person_id_source]
-
-     for persondata in csvr:
-         if not valid_value(persondata[person_columns[person_id_source]]): #just checking that the id is not an empty string
-             reject_count += 1
-             continue
-         if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
-             reject_count += 1
-             continue
-         if persondata[person_col] not in person_ids: #if not already in person_ids dict, add it
-             if use_input_person_ids == "N":
-                 person_ids[persondata[person_col]] = str(person_number) #create a new integer person_id
-                 person_number += 1
-             else:
-                 person_ids[persondata[person_col]] = str(persondata[person_col]) #use existing person_id
-     fh.close()
-
-     return person_ids, reject_count
-
- @click.group(help="Commands for using python configurations to run the ETL transformation.")
- def py():
+ @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+ def run():
     pass
 
- def check_dir_isvalid(directory: str | tuple[str, ...]) -> None:
-     ## check output dir is valid
-     if type(directory) is tuple:
-         directory = directory[0]
-
-     if not os.path.isdir(directory):
-         print("Not a directory, dir {0}".format(directory))
-         sys.exit(1)
-
- def set_saved_person_id_file(saved_person_id_file: str, output_dir: str) -> str:
-     ## check if there is a saved person id file set in options - if not, check if the file exists and remove it
-     if saved_person_id_file is None:
-         saved_person_id_file = output_dir + "/" + "person_ids.tsv"
-         if os.path.exists(saved_person_id_file):
-             os.remove(saved_person_id_file)
-     return saved_person_id_file
-
-
- def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
-     for infile in existing_input_files:
-         if infile not in rules_input_files:
-             msg = "WARNING: no mapping rules found for existing input file - {0}".format(infile)
-             print(msg)
-     for infile in rules_input_files:
-         if infile not in existing_input_files:
-             msg = "WARNING: no data for mapped input file - {0}".format(infile)
-             print(msg)
-
- def open_file(directory: str, filename: str) -> tuple[IO[str], Iterator[list[str]]] | None:
- #def open_file(directory: str, filename: str):
-     try:
-         fh = open(directory + "/" + filename, mode="r", encoding="utf-8-sig")
-         csvr = csv.reader(fh)
-         return fh, csvr
-     except IOError as e:
-         print("Unable to open: {0}".format(directory + "/" + filename))
-         print("I/O error({0}): {1}".format(e.errno, e.strerror))
-         return None
-
- def set_omop_filenames(omop_ddl_file: str, omop_config_file: str, omop_version: str) -> tuple[str, str]:
-     if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
-         omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
-         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-         omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
-     return omop_config_file, omop_ddl_file
-
- def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
-     # Saved-person-file existence test, reload if found, return last used integer
-     if os.path.isfile(saved_person_id_file):
-         person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
-     else:
-         person_lookup = {}
-         last_used_integer = 1
-     return person_lookup, last_used_integer
-
- run.add_command(mapstream,"mapstream")
-
- if __name__== '__main__':
-     mapstream()
+
+ run.add_command(mapstream, "mapstream")
+ if __name__ == "__main__":
+     run()
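
For reference, the hand-rolled date handling removed above (parse_date, get_datetime_value and the valid_*_date checks) amounts to trying three date formats in turn and rejecting anything else. A condensed sketch of that removed 0.3.4 behaviour follows; it is not the new normalise_to8601 helper from carrottransform.tools.date_helpers, whose implementation is not part of this diff:

    import datetime

    # Condensed restatement of the date parsing that 0.4.0 deletes from run.py:
    # try ISO (YYYY-MM-DD), reverse ISO (DD-MM-YYYY), then UK (DD/MM/YYYY),
    # returning an ISO-formatted date string, or None when nothing parses.
    def parse_legacy_date(item: str) -> str | None:
        for fmt in ("%Y-%m-%d", "%d-%m-%Y", "%d/%m/%Y"):
            try:
                dt = datetime.datetime.strptime(item.split(" ")[0], fmt)
            except ValueError:
                continue
            return f"{dt.year}-{dt.month:02}-{dt.day:02}"
        return None

    print(parse_legacy_date("25/12/2001"))  # 2001-12-25
    print(parse_legacy_date("not a date"))  # None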