carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of carrot-transform might be problematic.

Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
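Most of the churn below is in carrottransform/cli/subcommands/run.py (entry 8 above): the mapstream subcommand keeps its option surface, but the module is reformatted, --input-dir now takes a single directory instead of multiple, and the helper functions move into the new carrottransform/tools modules. For orientation, here is a minimal sketch, not taken from the package, of driving the 0.4.0 command in-process with click's test runner; the file and directory names are placeholders:

    # Hypothetical invocation; paths are placeholders, options are from the diff below.
    from click.testing import CliRunner

    from carrottransform.cli.subcommands.run import run

    result = CliRunner().invoke(
        run,
        [
            "mapstream",
            "--rules-file", "rules/v1.json",
            "--person-file", "data/person.csv",
            "--input-dir", "data",      # 0.4.0: one directory, no longer multiple
            "--output-dir", "out",
            "--omop-version", "5.4",    # picks up the newly bundled 5.4 DDL
        ],
    )
    print(result.exit_code, result.output)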
carrottransform/cli/subcommands/run.py
@@ -1,84 +1,102 @@
- import carrottransform
- import carrottransform.tools as tools
- import click
- import csv
- import datetime
- import fnmatch
- import importlib.resources
- import json
- import logging
- import os
  import sys
  import time
-
- from carrottransform.tools.click import PathArgs
- from carrottransform.tools.omopcdm import OmopCDM
  from pathlib import Path
+ import click

- from typing import Iterator, IO, Iterable
- from ...tools.file_helpers import resolve_paths
-
- logger = logging.getLogger(__name__)
- if not logger.handlers:
-     logger.setLevel(logging.INFO)
-
-     console_handler = logging.StreamHandler()
-     console_handler.setLevel(logging.INFO)
-
-     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-     console_handler.setFormatter(formatter)
-
-     logger.addHandler(console_handler)
-
- @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
- def run():
-     pass
+ import carrottransform.tools as tools
+ from carrottransform.tools.click import PathArgs
+ from carrottransform.tools.file_helpers import (
+     check_dir_isvalid,
+     check_files_in_rules_exist,
+     open_file,
+     resolve_paths,
+     set_omop_filenames,
+ )
+ from carrottransform.tools.logger import logger_setup
+ from carrottransform.tools.core import (
+     get_target_records,
+ )
+ from carrottransform.tools.date_helpers import normalise_to8601
+ from carrottransform.tools.person_helpers import (
+     load_last_used_ids,
+     load_person_ids,
+     set_saved_person_id_file,
+ )
+ from carrottransform.tools.args import person_rules_check, OnlyOnePersonInputAllowed
+
+ logger = logger_setup()


  @click.command()
- @click.option("--rules-file", type=PathArgs,
-               required=True,
-               help="json file containing mapping rules")
- @click.option("--output-dir", type=PathArgs,
-               default=None,
-               required=True,
-               help="define the output directory for OMOP-format tsv files")
- @click.option("--write-mode",
-               default='w',
-               type=click.Choice(['w','a']),
-               help="force write-mode on output files")
- @click.option("--person-file", type=PathArgs,
-               required=True,
-               help="File containing person_ids in the first column")
- @click.option("--omop-ddl-file", type=PathArgs,
-               required=False,
-               help="File containing OHDSI ddl statements for OMOP tables")
- @click.option("--omop-config-file", type=PathArgs,
-               required=False,
-               help="File containing additional / override json config for omop outputs")
- @click.option("--omop-version",
-               required=False,
-               help="Quoted string containing omop version - eg '5.3'")
- @click.option("--saved-person-id-file", type=PathArgs,
-               default=None,
-               required=False,
-               help="Full path to person id file used to save person_id state and share person_ids between data sets")
- @click.option("--use-input-person-ids",
-               required=False,
-               default='N',
-               help="Use person ids as input without generating new integers")
- @click.option("--last-used-ids-file", type=PathArgs,
-               default=None,
-               required=False,
-               help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
- @click.option("--log-file-threshold",
-               required=False,
-               default=0,
-               help="Lower outcount limit for logfile output")
- @click.option("--input-dir", type=PathArgs,
+ @click.option(
+     "--rules-file",
+     type=PathArgs,
+     required=True,
+     help="json file containing mapping rules",
+ )
+ @click.option(
+     "--output-dir",
+     type=PathArgs,
+     default=None,
+     required=True,
+     help="define the output directory for OMOP-format tsv files",
+ )
+ @click.option(
+     "--write-mode",
+     default="w",
+     type=click.Choice(["w", "a"]),
+     help="force write-mode on output files",
+ )
+ @click.option(
+     "--person-file",
+     type=PathArgs,
      required=True,
-     multiple=True,
-     help="Input directories")
+     help="File containing person_ids in the first column",
+ )
+ @click.option(
+     "--omop-ddl-file",
+     type=PathArgs,
+     required=False,
+     help="File containing OHDSI ddl statements for OMOP tables",
+ )
+ @click.option(
+     "--omop-config-file",
+     type=PathArgs,
+     required=False,
+     help="File containing additional / override json config for omop outputs",
+ )
+ @click.option(
+     "--omop-version",
+     required=False,
+     help="Quoted string containing omop version - eg '5.3'",
+ )
+ @click.option(
+     "--saved-person-id-file",
+     type=PathArgs,
+     default=None,
+     required=False,
+     help="Full path to person id file used to save person_id state and share person_ids between data sets",
+ )
+ @click.option(
+     "--use-input-person-ids",
+     required=False,
+     default="N",
+     help="Use person ids as input without generating new integers",
+ )
+ @click.option(
+     "--last-used-ids-file",
+     type=PathArgs,
+     default=None,
+     required=False,
+     help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer",
+ )
+ @click.option(
+     "--log-file-threshold",
+     required=False,
+     default=0,
+     help="Lower outcount limit for logfile output",
+ )
+ @click.option("--input-dir", type=PathArgs, required=True, help="Input directories")
  def mapstream(
      rules_file: Path,
      output_dir: Path,
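The import block above replaces the inline logging bootstrap that 0.3.5 ran at module import; the new carrottransform/tools/logger.py (19 lines, per the file list) now supplies logger_setup(). A plausible sketch of that module, inferred only from the removed block and not from the new file itself:

    import logging

    def logger_setup() -> logging.Logger:
        # Mirrors the handler/formatter setup 0.3.5 performed at import time
        # in run.py; the shipped tools/logger.py may differ in naming and levels.
        logger = logging.getLogger("carrottransform")
        if not logger.handlers:  # avoid stacking duplicate handlers
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            handler.setLevel(logging.INFO)
            handler.setFormatter(
                logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            )
            logger.addHandler(handler)
        return logger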
@@ -91,15 +109,14 @@ def mapstream(
      use_input_person_ids,
      last_used_ids_file: Path,
      log_file_threshold,
-     input_dir: Iterable[Path],
+     input_dir: Path,
  ):
      """
      Map to output using input streams
      """

-
      # Resolve any @package paths in the arguments
-     resolved_paths = resolve_paths([
+     [
          rules_file,
          output_dir,
          person_file,
@@ -107,20 +124,19 @@ def mapstream(
          omop_config_file,
          saved_person_id_file,
          last_used_ids_file,
-         input_dir[0] if input_dir else None  # Take first element of input_dir tuple
-     ])
-
-     # Assign back resolved paths
-     [rules_file, output_dir, person_file, omop_ddl_file,
-      omop_config_file, saved_person_id_file, last_used_ids_file,
-      input_dir] = resolved_paths
-
-     # Ensure input_dir is a list of paths
-     if isinstance(input_dir, (Path, str)):
-         input_dir = [input_dir]
-     elif isinstance(input_dir, tuple):
-         input_dir = list(input_dir)
-     # If it's already a list, leave it as is
+         input_dir,
+     ] = resolve_paths(
+         [
+             rules_file,
+             output_dir,
+             person_file,
+             omop_ddl_file,
+             omop_config_file,
+             saved_person_id_file,
+             last_used_ids_file,
+             input_dir,
+         ]
+     )

      # Initialisation
      # - check for values in optional arguments
@@ -150,18 +166,37 @@ def mapstream(
          )
      )

+     # check on the rules file
+     if (rules_file is None) or (not rules_file.is_file()):
+         logger.exception(f"rules file was set to `{rules_file=}` and is missing")
+         sys.exit(-1)
+
      ## set omop filenames
      omop_config_file, omop_ddl_file = set_omop_filenames(
          omop_ddl_file, omop_config_file, omop_version
      )
      ## check directories are valid
-     for idir in input_dir:
-         check_dir_isvalid(idir)  # Input directory must exist
-     check_dir_isvalid(output_dir, create_if_missing=True)  # Create output directory if needed
-
+     check_dir_isvalid(input_dir)  # Input directory must exist - we need the files in it
+     check_dir_isvalid(
+         output_dir, create_if_missing=True
+     )  # Create output directory if needed

      saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)

+     ## check on the person_file_rules
+     try:
+         person_rules_check(rules_file=rules_file, person_file=person_file)
+     except OnlyOnePersonInputAllowed as e:
+         inputs = list(sorted(list(e._inputs)))
+
+         logger.error(
+             f"Person properties were mapped from ({inputs}) but can only come from the person file {person_file.name=}"
+         )
+         sys.exit(-1)
+     except Exception as e:
+         logger.exception(f"person_file_rules check failed: {e}")
+         sys.exit(-1)
+
      start_time = time.time()
      ## create OmopCDM object, which contains attributes and methods for the omop data tables.
      omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
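person_rules_check and OnlyOnePersonInputAllowed are new in carrottransform/tools/args.py; the handler above relies only on the exception exposing an _inputs collection of offending source tables. A rough sketch of the check, assuming a rules-file layout in which each person-table field names its source_table; the shipped implementation may differ:

    import json
    from pathlib import Path

    class OnlyOnePersonInputAllowed(Exception):
        def __init__(self, inputs):
            super().__init__("person fields mapped from more than the person file")
            self._inputs = set(inputs)  # read by the error handler above

    def person_rules_check(rules_file: Path, person_file: Path) -> None:
        # Collect every source table feeding the person table in the rules json
        # (layout assumed, not taken from the package).
        rules = json.loads(rules_file.read_text())
        inputs = {
            field["source_table"]
            for rule in rules.get("cdm", {}).get("person", {}).values()
            for field in rule.values()
        }
        if inputs - {person_file.name}:
            raise OnlyOnePersonInputAllowed(inputs)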
@@ -192,21 +227,24 @@ def mapstream(

      try:
          ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
-         person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
-                                                                person_file, mappingrules,
-                                                                use_input_person_ids)
+         person_lookup, rejected_person_count = load_person_ids(
+             saved_person_id_file, person_file, mappingrules, use_input_person_ids
+         )
+
          ## open person_ids output file
          with saved_person_id_file.open(mode="w") as fhpout:
              ## write the header to the file
              fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
              ##iterate through the ids and write them to the file.
              for person_id, person_assigned_id in person_lookup.items():
-                 fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")
+                 fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")

          ## Initialise output files (adding them to a dict), output a header for each
          ## these aren't being closed deliberately
          for tgtfile in output_files:
-             fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+             fhd[tgtfile] = (
+                 (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+             )
              if write_mode == "w":
                  outhdr = omopcdm.get_omop_column_list(tgtfile)
                  fhd[tgtfile].write("\t".join(outhdr) + "\n")
@@ -218,10 +256,12 @@ def mapstream(
          logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
          exit()

-     logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")
+     logger.info(
+         f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
+     )

      ## Compare files found in the input_dir with those expected based on mapping rules
-     existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
+     existing_input_files = [f.name for f in input_dir.glob("*.csv")]
      rules_input_files = mappingrules.get_all_infile_names()

      ## Log mismatches but continue
@@ -239,27 +279,31 @@ def mapstream(

      ## main processing loop, for each input file
      for srcfilename in rules_input_files:
-         outcounts = {}
-         rejcounts = {}
          rcount = 0

-         fh, csvr = open_file(input_dir[0] / srcfilename)
-         if fh is None:
-             continue
-
+         fhcsvr = open_file(input_dir / srcfilename)
+         if fhcsvr is None:  # check if it's none before unpacking
+             raise Exception(f"Couldn't find file {srcfilename} in {input_dir}")
+         fh, csvr = fhcsvr  # unpack now because we can't unpack none

          ## create dict for input file, giving the data and output file
          tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
-         infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+         infile_datetime_source, infile_person_id_source = (
+             mappingrules.get_infile_date_person_id(srcfilename)
+         )
+
+         outcounts = {}
+         rejcounts = {}
          for tgtfile in tgtfiles:
              outcounts[tgtfile] = 0
              rejcounts[tgtfile] = 0
+
          datacolsall = []
-         hdrdata = next(csvr)
+         csv_column_headers = next(csvr)
          dflist = mappingrules.get_infile_data_fields(srcfilename)
-         for colname in hdrdata:
+         for colname in csv_column_headers:
              datacolsall.append(colname)
-         inputcolmap = omopcdm.get_column_map(hdrdata)
+         inputcolmap = omopcdm.get_column_map(csv_column_headers)
          pers_id_col = inputcolmap[infile_person_id_source]
          datetime_col = inputcolmap[infile_datetime_source]

@@ -271,28 +315,28 @@ def mapstream(
          # for each input record
          for indata in csvr:
              metrics.increment_key_count(
+                 source=srcfilename,
+                 fieldname="all",
+                 tablename="all",
+                 concept_id="all",
+                 additional="",
+                 count_type="input_count",
+             )
+             rcount += 1
+
+             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD HH:MM:SS
+             fulldate = normalise_to8601(indata[datetime_col])
+             if fulldate is not None:
+                 indata[datetime_col] = fulldate
+             else:
+                 metrics.increment_key_count(
                      source=srcfilename,
                      fieldname="all",
                      tablename="all",
                      concept_id="all",
                      additional="",
-                     count_type="input_count"
+                     count_type="input_date_fields",
                  )
-             rcount += 1
-             # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
-             strdate = indata[datetime_col].split(" ")[0]
-             fulldate = parse_date(strdate)
-             if fulldate is not None:
-                 indata[datetime_col] = fulldate
-             else:
-                 metrics.increment_key_count(
-                     source=srcfilename,
-                     fieldname="all",
-                     tablename="all",
-                     concept_id="all",
-                     additional="",
-                     count_type="input_date_fields"
-                 )
                  continue

              for tgtfile in tgtfiles:
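The date handling above switches from the hand-rolled parse_date/get_datetime_value helpers (deleted further down) to normalise_to8601 from the new carrottransform/tools/date_helpers.py. A sketch consistent with the removed helpers and the comment above, though the shipped module may accept more formats:

    import datetime

    def normalise_to8601(item: str) -> str | None:
        # Try the same formats the removed 0.3.5 helpers accepted: ISO,
        # reverse-ISO, and UK-style dates; ignore any time-of-day suffix.
        date_part = item.split(" ")[0]
        for fmt in ("%Y-%m-%d", "%d-%m-%Y", "%d/%m/%Y"):
            try:
                dt = datetime.datetime.strptime(date_part, fmt)
                return dt.strftime("%Y-%m-%d %H:%M:%S")  # zero-filled time
            except ValueError:
                continue
        return None  # caller counts the row under input_date_fields and skips it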
@@ -305,40 +349,61 @@ def mapstream(
                  datacols = dflist[tgtfile]

                  for datacol in datacols:
-                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
+                     built_records, outrecords, metrics = get_target_records(
+                         tgtfile,
+                         tgtcolmap,
+                         src_to_tgt,
+                         datacol,
+                         indata,
+                         inputcolmap,
+                         srcfilename,
+                         omopcdm,
+                         metrics,
+                     )
+
                      if built_records:
                          for outrecord in outrecords:
                              if auto_num_col is not None:
-                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                                 outrecord[tgtcolmap[auto_num_col]] = str(
+                                     record_numbers[tgtfile]
+                                 )
                              ### most of the rest of this section is actually to do with metrics
                              record_numbers[tgtfile] += 1
+
                              if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
-                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[
+                                     outrecord[tgtcolmap[pers_id_col]]
+                                 ]
                                  outcounts[tgtfile] += 1

                                  metrics.increment_with_datacol(
-                                     source_path=srcfilename,
-                                     target_file=tgtfile,
-                                     datacol=datacol,
-                                     out_record=outrecord
-                                 )
+                                     source_path=srcfilename,
+                                     target_file=tgtfile,
+                                     datacol=datacol,
+                                     out_record=outrecord,
+                                 )

                                  # write the line to the file
                                  fhd[tgtfile].write("\t".join(outrecord) + "\n")
                              else:
                                  metrics.increment_key_count(
-                                     source=srcfilename,
-                                     fieldname="all",
-                                     tablename=tgtfile,
-                                     concept_id="all",
-                                     additional="",
-                                     count_type="invalid_person_ids",
-                                 )
+                                     source=srcfilename,
+                                     fieldname="all",
+                                     tablename=tgtfile,
+                                     concept_id="all",
+                                     additional="",
+                                     count_type="invalid_person_ids",
+                                 )
                                  rejidcounts[srcfilename] += 1

+             if tgtfile == "person":
+                 break
+
          fh.close()

-         logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
+         logger.info(
+             f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs"
+         )
          for outtablename, count in outcounts.items():
              logger.info(f"TARGET: {outtablename}: output count {str(count)}")
      # END main processing loop
@@ -346,7 +411,7 @@ def mapstream(
      logger.info(
          "--------------------------------------------------------------------------------"
      )
-
+
      data_summary = metrics.get_mapstream_summary()
      try:
          dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
@@ -361,388 +426,11 @@ def mapstream(
      logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")


- def get_target_records(
-     tgtfilename: str,
-     tgtcolmap: dict[str, dict[str, int]],
-     rulesmap: dict[str, list[dict[str, list[str]]]],
-     srcfield: str,
-     srcdata: list[str],
-     srccolmap: dict[str, int],
-     srcfilename: str,
-     omopcdm: OmopCDM,
-     metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
-     """
-     build all target records for a given input field
-     """
-     build_records = False
-     tgtrecords = []
-     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
-     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
-     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
-
-     srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
-     summarykey = srckey + "~all~"
-     if valid_value(str(srcdata[srccolmap[srcfield]])):
-         ## check if either or both of the srckey and summarykey are in the rules
-         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
-         dictkeys = []
-         if srcfullkey in rulesmap:
-             build_records = True
-             dictkeys.append(srcfullkey)
-         if srckey in rulesmap:
-             build_records = True
-             dictkeys.append(srckey)
-         if build_records:
-             for dictkey in dictkeys:
-                 for out_data_elem in rulesmap[dictkey]:
-                     valid_data_elem = True
-                     ## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
-                     tgtarray = ['']*len(tgtcolmap)
-                     for req_integer in notnull_numeric_fields:
-                         tgtarray[tgtcolmap[req_integer]] = "0"
-                     for infield, outfield_list in out_data_elem.items():
-                         for output_col_data in outfield_list:
-                             if "~" in output_col_data:
-                                 outcol, term = output_col_data.split("~")
-                                 tgtarray[tgtcolmap[outcol]] = term
-                             else:
-                                 tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
-                                 if output_col_data in date_component_data:
-                                     ## parse the date and store it in the proper format
-                                     strdate = srcdata[srccolmap[infield]].split(" ")[0]
-                                     dt = get_datetime_value(strdate)
-                                     if dt != None:
-                                         year_field = date_component_data[output_col_data]["year"]
-                                         month_field = date_component_data[output_col_data]["month"]
-                                         day_field = date_component_data[output_col_data]["day"]
-                                         tgtarray[tgtcolmap[year_field]] = str(dt.year)
-                                         tgtarray[tgtcolmap[month_field]] = str(dt.month)
-                                         tgtarray[tgtcolmap[day_field]] = str(dt.day)
-                                         fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
-                                         tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                     else:
-                                         metrics.increment_key_count(
-                                             source=srcfilename,
-                                             fieldname=srcfield,
-                                             tablename=tgtfilename,
-                                             concept_id="all",
-                                             additional="",
-                                             count_type="invalid_date_fields"
-                                         )
-                                         valid_data_elem = False
-                                 elif output_col_data in date_col_data:
-                                     fulldate = srcdata[srccolmap[infield]]
-                                     tgtarray[tgtcolmap[output_col_data]] = fulldate
-                                     tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
-                     if valid_data_elem:
-                         tgtrecords.append(tgtarray)
-     else:
-         metrics.increment_key_count(
-             source=srcfilename,
-             fieldname=srcfield,
-             tablename=tgtfilename,
-             concept_id="all",
-             additional="",
-             count_type="invalid_source_fields"
-         )
-
-     return build_records, tgtrecords, metrics
-
-
- def valid_value(item):
-     """
-     Check if an item is non blank (null)
-     """
-     if item.strip() == "":
-         return False
-     return True
-
-
- # DATE TESTING
- # ------------
- # I started by changing the get_datetime_value to be neater.
- # I think it should be handled all as one thing, but I've spent too much time doing this already
-
- def valid_date_value(item):
-     """
-     Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
-     or dd/mm/yyyy or mm/dd/yyyy
-     """
-     if item.strip() == "":
-         return(False)
-     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
-         logger.warning("Bad date : {0}".format(item))
-         return False
-     return True
-
-
- def get_datetime_value(item):
-     """
-     Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
-     or UK format (DD/MM/YYYY).
-     Returns a datetime object if successful, None otherwise.
-     """
-     date_formats = [
-         "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
-         "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
-         "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
-     ]
-
-     for date_format in date_formats:
-         try:
-             return datetime.datetime.strptime(item, date_format)
-         except ValueError:
-             continue
-
-     # If we get here, none of the formats worked
-     return None
-
-
- def parse_date(item):
-     """
-     Crude hand-coded check on date format
-     """
-     datedata = item.split("-")
-     if len(datedata) != 3:
-         datedata = item.split("/")
-     if len(datedata) != 3:
-         return None
-     if len(datedata[2]) == 4:
-         return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
-     return "-".join(datedata[:3])
-
- def valid_iso_date(item):
-     """
-     Check if a date item is non null and parses as ISO (YYYY-MM-DD)
-     """
-     try:
-         datetime.datetime.strptime(item, "%Y-%m-%d")
-     except ValueError:
-         return False
-
-     return True
-
-
- def valid_reverse_iso_date(item):
-     """
-     Check if a date item is non null and parses as reverse ISO (DD-MM-YYYY)
-     """
-     try:
-         datetime.datetime.strptime(item, "%d-%m-%Y")
-     except ValueError:
-         return False
-
-     return True
-
-
- def valid_uk_date(item):
-     """
-     Check if a date item is non null and parses as UK format (DD/MM/YYYY)
-     """
-     try:
-         datetime.datetime.strptime(item, "%d/%m/%Y")
-     except ValueError:
-         return False
-
-     return True
-
-
- # End of date code
-
- def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
-     fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter="\t")
-
-     for last_ids_data in csvr:
-         last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
-
-     fh.close()
-     return last_used_ids
-
-
- def load_saved_person_ids(person_file: Path):
-     fh = person_file.open(mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter="\t")
-     last_int = 1
-     person_ids = {}
-
-     next(csvr)
-     for persondata in csvr:
-         person_ids[persondata[0]] = persondata[1]
-         last_int += 1
-
-     fh.close()
-     return person_ids, last_int
-
- def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
-     person_ids, person_number = get_person_lookup(saved_person_id_file)
-
-     fh = person_file.open(mode="r", encoding="utf-8-sig")
-     csvr = csv.reader(fh, delimiter=delim)
-     person_columns = {}
-     person_col_in_hdr_number = 0
-     reject_count = 0
-
-     personhdr = next(csvr)
-     logger.info(personhdr)
-
-     # Make a dictionary of column names vs their positions
-     for col in personhdr:
-         person_columns[col] = person_col_in_hdr_number
-         person_col_in_hdr_number += 1
-
-     ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
-     birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
-         "person"
-     )
-     logger.info(
-         "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
-     )
-
-     ## get the column index of the PersonID from the input file
-     person_col = person_columns[person_id_source]
-
-     for persondata in csvr:
-         if not valid_value(persondata[person_columns[person_id_source]]):  # just checking that the id is not an empty string
-             reject_count += 1
-             continue
-         if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
-             reject_count += 1
-             continue
-         if persondata[person_col] not in person_ids:  # if not already in person_ids dict, add it
-             if use_input_person_ids == "N":
-                 person_ids[persondata[person_col]] = str(person_number)  # create a new integer person_id
-                 person_number += 1
-             else:
-                 person_ids[persondata[person_col]] = str(persondata[person_col])  # use existing person_id
-     fh.close()
-
-     return person_ids, reject_count
-
- @click.group(help="Commands for using python configurations to run the ETL transformation.")
- def py():
+ @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+ def run():
      pass


- def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
-     """Check if directory is valid, optionally create it if missing.
-
-     Args:
-         directory: Directory path as string or tuple
-         create_if_missing: If True, create directory if it doesn't exist
-     """
-
-     ## check directory has been set
-     if directory is None:
-         logger.warning("Directory not provided.")
-         sys.exit(1)
-
-     ## check output dir is valid
-     elif type(directory) is tuple:
-         directory = directory[0]
-
-
-     ## if not a directory, create it if requested (including parents. This option is for the output directory only).
-     if not directory.is_dir():
-         if create_if_missing:
-             try:
-                 ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
-                 directory.mkdir(parents = True)
-                 logger.info(f"Created directory: {directory}")
-             except OSError as e:
-                 logger.warning(f"Failed to create directory {directory}: {e}")
-                 sys.exit(1)
-         else:
-             logger.warning(f"Not a directory, dir {directory}")
-             sys.exit(1)
-
-     # Handle tuple input (like input_dir)
-     if isinstance(directory, tuple):
-         if not directory:  # Empty tuple
-             print("No directory provided")
-             sys.exit(1)
-         directory = directory[0]
-
-     # Handle string input
-     dir_path = str(directory)
-     if not os.path.isdir(dir_path):
-         if create_if_missing:
-             try:
-                 os.makedirs(dir_path)
-                 print(f"Created directory: {dir_path}")
-             except OSError as e:
-                 print(f"Failed to create directory {dir_path}: {e}")
-                 sys.exit(1)
-         else:
-             print(f"Not a directory, dir {dir_path}")
-             sys.exit(1)
-
-
- def set_saved_person_id_file(
-     saved_person_id_file: Path | None, output_dir: Path
- ) -> Path:
-     """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
-
-     if saved_person_id_file is None:
-         saved_person_id_file = output_dir / "person_ids.tsv"
-         if saved_person_id_file.exists():
-             assert not saved_person_id_file.is_dir()
-             saved_person_id_file.unlink()
-     else:
-         assert not saved_person_id_file.is_dir()
-     return saved_person_id_file
-
- def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
-     for infile in existing_input_files:
-         if infile not in rules_input_files:
-             msg = (
-                 "WARNING: no mapping rules found for existing input file - {0}".format(
-                     infile
-                 )
-             )
-             logger.warning(msg)
-     for infile in rules_input_files:
-         if infile not in existing_input_files:
-             msg = "WARNING: no data for mapped input file - {0}".format(infile)
-             logger.warning(msg)
-
- def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
-     """opens a file and does something related to CSVs"""
-     try:
-         fh = file_path.open(mode="r", encoding="utf-8-sig")
-         csvr = csv.reader(fh)
-         return fh, csvr
-     except IOError as e:
-         logger.exception("Unable to open: {0}".format(file_path))
-         logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
-         return None
-
-
- def set_omop_filenames(
-     omop_ddl_file: Path, omop_config_file: Path, omop_version: str
- ) -> tuple[Path, Path]:
-     if (
-         (omop_ddl_file is None)
-         and (omop_config_file is None)
-         and (omop_version is not None)
-     ):
-         omop_config_file = (
-             importlib.resources.files("carrottransform") / "config/omop.json"
-         )
-         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-         omop_ddl_file = (
-             importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
-         )
-     return omop_config_file, omop_ddl_file
-
-
- def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
-     # Saved-person-file existence test, reload if found, return last used integer
-     if saved_person_id_file.is_file():
-         person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
-     else:
-         person_lookup = {}
-         last_used_integer = 1
-     return person_lookup, last_used_integer
-
- run.add_command(mapstream,"mapstream")
+ run.add_command(mapstream, "mapstream")
+ if __name__ == "__main__":
+     run()