carrot-transform 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of carrot-transform might be problematic; more details are available on the package's registry page.

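One user-visible change in this diff is that mapstream now takes its input directory through a required, repeatable --input-dir option (with PathArgs conversion) instead of a trailing positional argument, and --output-dir becomes required. The sketch below shows how the 0.3.5 command might be invoked in-process with click's test runner; the module path is inferred from the relative imports in the diff, and the file paths are placeholders, not files shipped with the package.

from click.testing import CliRunner

# module path assumed from the package layout implied by the diff; adjust if the CLI lives elsewhere
from carrottransform.cli.subcommands.run import mapstream

runner = CliRunner()
result = runner.invoke(mapstream, [
    "--rules-file", "rules.json",        # mapping-rules JSON (placeholder path)
    "--output-dir", "omop_out",          # 0.3.5 creates this directory if it is missing
    "--person-file", "data/person.csv",  # file whose first column holds person_ids
    "--omop-version", "5.3",             # selects the bundled OMOP config and ddl files
    "--input-dir", "data",               # 0.3.3 accepted the input directory positionally
])
print(result.exit_code)
print(result.output)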
@@ -1,42 +1,65 @@
+ import carrottransform
+ import carrottransform.tools as tools
+ import click
  import csv
- import os, time
  import datetime
  import fnmatch
- import sys
- import click
- import json
  import importlib.resources
- import carrottransform
- import carrottransform.tools as tools
+ import json
+ import logging
+ import os
+ import sys
+ import time
+
+ from carrottransform.tools.click import PathArgs
+ from carrottransform.tools.omopcdm import OmopCDM
+ from pathlib import Path
+
+ from typing import Iterator, IO, Iterable
+ from ...tools.file_helpers import resolve_paths
+
+ logger = logging.getLogger(__name__)
+ if not logger.handlers:
+ logger.setLevel(logging.INFO)
+
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(logging.INFO)
+
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ console_handler.setFormatter(formatter)
+
+ logger.addHandler(console_handler)

  @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
  def run():
  pass

+
  @click.command()
- @click.option("--rules-file",
+ @click.option("--rules-file", type=PathArgs,
  required=True,
  help="json file containing mapping rules")
- @click.option("--output-dir",
+ @click.option("--output-dir", type=PathArgs,
  default=None,
+ required=True,
  help="define the output directory for OMOP-format tsv files")
  @click.option("--write-mode",
  default='w',
  type=click.Choice(['w','a']),
  help="force write-mode on output files")
- @click.option("--person-file",
+ @click.option("--person-file", type=PathArgs,
  required=True,
  help="File containing person_ids in the first column")
- @click.option("--omop-ddl-file",
+ @click.option("--omop-ddl-file", type=PathArgs,
  required=False,
  help="File containing OHDSI ddl statements for OMOP tables")
- @click.option("--omop-config-file",
+ @click.option("--omop-config-file", type=PathArgs,
  required=False,
  help="File containing additional / override json config for omop outputs")
  @click.option("--omop-version",
  required=False,
- help="Quoted string containing opmop version - eg '5.3'")
- @click.option("--saved-person-id-file",
+ help="Quoted string containing omop version - eg '5.3'")
+ @click.option("--saved-person-id-file", type=PathArgs,
  default=None,
  required=False,
  help="Full path to person id file used to save person_id state and share person_ids between data sets")
@@ -44,7 +67,7 @@ def run():
  required=False,
  default='N',
  help="Use person ids as input without generating new integers")
- @click.option("--last-used-ids-file",
+ @click.option("--last-used-ids-file", type=PathArgs,
  default=None,
  required=False,
  help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
@@ -52,124 +75,180 @@ def run():
  required=False,
  default=0,
  help="Lower outcount limit for logfile output")
- @click.argument("input-dir",
- required=False,
- nargs=-1)
- def mapstream(rules_file, output_dir, write_mode,
- person_file, omop_ddl_file, omop_config_file,
- omop_version, saved_person_id_file, use_input_person_ids,
- last_used_ids_file, log_file_threshold, input_dir):
+ @click.option("--input-dir", type=PathArgs,
+ required=True,
+ multiple=True,
+ help="Input directories")
+ def mapstream(
+ rules_file: Path,
+ output_dir: Path,
+ write_mode,
+ person_file: Path,
+ omop_ddl_file: Path,
+ omop_config_file: Path,
+ omop_version,
+ saved_person_id_file: Path,
+ use_input_person_ids,
+ last_used_ids_file: Path,
+ log_file_threshold,
+ input_dir: Iterable[Path],
+ ):
  """
  Map to output using input streams
  """
- # Initialisation
+
+
+ # Resolve any @package paths in the arguments
+ resolved_paths = resolve_paths([
+ rules_file,
+ output_dir,
+ person_file,
+ omop_ddl_file,
+ omop_config_file,
+ saved_person_id_file,
+ last_used_ids_file,
+ input_dir[0] if input_dir else None # Take first element of input_dir tuple
+ ])
+
+ # Assign back resolved paths
+ [rules_file, output_dir, person_file, omop_ddl_file,
+ omop_config_file, saved_person_id_file, last_used_ids_file,
+ input_dir] = resolved_paths
+
+ # Ensure input_dir is a list of paths
+ if isinstance(input_dir, (Path, str)):
+ input_dir = [input_dir]
+ elif isinstance(input_dir, tuple):
+ input_dir = list(input_dir)
+ # If it's already a list, leave it as is
+
+ # Initialisation
  # - check for values in optional arguments
  # - read in configuration files
  # - check main directories for existence
- # - handle saved persion ids
- # - initialise metrics
- if (omop_ddl_file == None) and (omop_config_file == None) and (omop_version != None):
- omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
- omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
- omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
-
- if os.path.isdir(input_dir[0]) == False:
- print("Not a directory, input dir {0}".format(input_dir[0]))
- sys.exit(1)
-
- if os.path.isdir(output_dir) == False:
- print("Not a directory, output dir {0}".format(output_dir))
- sys.exit(1)
-
- if saved_person_id_file == None:
- saved_person_id_file = output_dir + "/" + "person_ids.tsv"
- if os.path.exists(saved_person_id_file):
- os.remove(saved_person_id_file)
-
- starttime = time.time()
+ # - handle saved person ids
+ # - initialise metrics
+ logger.info(
+ ",".join(
+ map(
+ str,
+ [
+ rules_file,
+ output_dir,
+ write_mode,
+ person_file,
+ omop_ddl_file,
+ omop_config_file,
+ omop_version,
+ saved_person_id_file,
+ use_input_person_ids,
+ last_used_ids_file,
+ log_file_threshold,
+ input_dir,
+ ],
+ )
+ )
+ )
+
+ ## set omop filenames
+ omop_config_file, omop_ddl_file = set_omop_filenames(
+ omop_ddl_file, omop_config_file, omop_version
+ )
+ ## check directories are valid
+ for idir in input_dir:
+ check_dir_isvalid(idir) # Input directory must exist
+ check_dir_isvalid(output_dir, create_if_missing=True) # Create output directory if needed
+
+
+ saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
+
+ start_time = time.time()
+ ## create OmopCDM object, which contains attributes and methods for the omop data tables.
  omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+
+ ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
  mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
  metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
- nowtime = time.time()

- print("--------------------------------------------------------------------------------")
- print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+ logger.info(
+ "--------------------------------------------------------------------------------"
+ )
+ logger.info(
+ f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
+ )
+
  output_files = mappingrules.get_all_outfile_names()
+
+ ## set record number
+ ## will keep track of the current record number in each file, e.g., measurement_id, observation_id.
  record_numbers = {}
  for output_file in output_files:
  record_numbers[output_file] = 1
+ if (last_used_ids_file is not None) and last_used_ids_file.is_file():
+ record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)

  fhd = {}
  tgtcolmaps = {}

  try:
- # Saved-person-file existence test, reload if found, return last used integer
- if os.path.isfile(saved_person_id_file):
- person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
- else:
- person_lookup = {}
- last_used_integer = 1
- if last_used_ids_file != None:
- if os.path.isfile(last_used_ids_file):
- record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
-
- person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
- fhpout = open(saved_person_id_file, mode="w")
- fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
- for person_id, person_assigned_id in person_lookup.items():
- fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
- fhpout.close()
- # Initialise output files, output a header for each
+ ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
+ person_lookup, rejected_person_count = load_person_ids(saved_person_id_file,
+ person_file, mappingrules,
+ use_input_person_ids)
+ ## open person_ids output file
+ with saved_person_id_file.open(mode="w") as fhpout:
+ ## write the header to the file
+ fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+ ##iterate through the ids and write them to the file.
+ for person_id, person_assigned_id in person_lookup.items():
+ fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}")
+
+ ## Initialise output files (adding them to a dict), output a header for each
+ ## these aren't being closed deliberately
  for tgtfile in output_files:
- fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
- if write_mode == 'w':
+ fhd[tgtfile] = (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
+ if write_mode == "w":
  outhdr = omopcdm.get_omop_column_list(tgtfile)
  fhd[tgtfile].write("\t".join(outhdr) + "\n")
+ ## maps all omop columns for each file into a dict containing the column name and the index
+ ## so tgtcolmaps is a dict of dicts.
  tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

  except IOError as e:
- print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+ logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
  exit()

- print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+ logger.info(f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}")

- # Compare files found in the input_dir with those expected based on mapping rules
- existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+ ## Compare files found in the input_dir with those expected based on mapping rules
+ existing_input_files = [f.name for f in input_dir[0].glob("*.csv")]
  rules_input_files = mappingrules.get_all_infile_names()
- # Log mismatches but continue
- for infile in existing_input_files:
- if infile not in rules_input_files:
- msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
- print(msg)
- for infile in rules_input_files:
- if infile not in existing_input_files:
- msg = "ERROR: no data for mapped input file - {0}".format(infile)
- print(msg)

- # set up overall counts
+ ## Log mismatches but continue
+ check_files_in_rules_exist(rules_input_files, existing_input_files)
+
+ ## set up overall counts
  rejidcounts = {}
  rejdatecounts = {}
- print(rules_input_files)
+ logger.info(rules_input_files)

- # set up per-input counts
+ ## set up per-input counts
  for srcfilename in rules_input_files:
  rejidcounts[srcfilename] = 0
  rejdatecounts[srcfilename] = 0

- # main processing loop, for each input file
+ ## main processing loop, for each input file
  for srcfilename in rules_input_files:
  outcounts = {}
  rejcounts = {}
  rcount = 0

- try:
- fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
- csvr = csv.reader(fh)
- except IOError as e:
- print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
- print("I/O error({0}): {1}".format(e.errno, e.strerror))
+ fh, csvr = open_file(input_dir[0] / srcfilename)
+ if fh is None:
  continue

+
+ ## create dict for input file, giving the data and output file
  tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
  infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
  for tgtfile in tgtfiles:
@@ -183,20 +262,37 @@ def mapstream(rules_file, output_dir, write_mode,
  inputcolmap = omopcdm.get_column_map(hdrdata)
  pers_id_col = inputcolmap[infile_person_id_source]
  datetime_col = inputcolmap[infile_datetime_source]
- print("--------------------------------------------------------------------------------")
- print("Processing input: {0}".format(srcfilename))
-
+
+ logger.info(
+ "--------------------------------------------------------------------------------"
+ )
+ logger.info(f"Processing input: {srcfilename}")
+
  # for each input record
  for indata in csvr:
- key = srcfilename + "~all~all~all~"
- metrics.increment_key_count(key, "input_count")
+ metrics.increment_key_count(
+ source=srcfilename,
+ fieldname="all",
+ tablename="all",
+ concept_id="all",
+ additional="",
+ count_type="input_count"
+ )
  rcount += 1
+ # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD
  strdate = indata[datetime_col].split(" ")[0]
  fulldate = parse_date(strdate)
- if fulldate != None:
+ if fulldate is not None:
  indata[datetime_col] = fulldate
  else:
- metrics.increment_key_count(key, "invalid_date_fields")
+ metrics.increment_key_count(
+ source=srcfilename,
+ fieldname="all",
+ tablename="all",
+ concept_id="all",
+ additional="",
+ count_type="input_date_fields"
+ )
  continue

  for tgtfile in tgtfiles:
@@ -210,63 +306,71 @@ def mapstream(rules_file, output_dir, write_mode,
 
  for datacol in datacols:
  built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
- if built_records == True:
+ if built_records:
  for outrecord in outrecords:
- if auto_num_col != None:
+ if auto_num_col is not None:
  outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+ ### most of the rest of this section is actually to do with metrics
  record_numbers[tgtfile] += 1
  if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
  outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
  outcounts[tgtfile] += 1
- key = srcfilename + "~all~all~all~"
- metrics.increment_key_count(key, "output_count")
- key = "all~all~" + tgtfile + "~all~"
- metrics.increment_key_count(key, "output_count")
- key = srcfilename + "~all~" + tgtfile + "~all~"
- metrics.increment_key_count(key, "output_count")
- if tgtfile == "person":
- key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] +"~"
- metrics.increment_key_count(key, "output_count")
- key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
- metrics.increment_key_count(key, "output_count")
- else:
- key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[2] + "~"
- metrics.increment_key_count(key, "output_count")
- key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
- metrics.increment_key_count(key, "output_count")
- key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
- metrics.increment_key_count(key, "output_count")
- key = "all~all~all~" + outrecord[2] + "~"
- metrics.increment_key_count(key, "output_count")
+
+ metrics.increment_with_datacol(
+ source_path=srcfilename,
+ target_file=tgtfile,
+ datacol=datacol,
+ out_record=outrecord
+ )
+
+ # write the line to the file
  fhd[tgtfile].write("\t".join(outrecord) + "\n")
  else:
- key = srcfilename + "~all~" + tgtfile + "~all~"
- metrics.increment_key_count(key, "invalid_person_ids")
+ metrics.increment_key_count(
+ source=srcfilename,
+ fieldname="all",
+ tablename=tgtfile,
+ concept_id="all",
+ additional="",
+ count_type="invalid_person_ids",
+ )
  rejidcounts[srcfilename] += 1

  fh.close()

- nowtime= time.time()
- print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+ logger.info(f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs")
  for outtablename, count in outcounts.items():
- print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+ logger.info(f"TARGET: {outtablename}: output count {str(count)}")
  # END main processing loop

- print("--------------------------------------------------------------------------------")
+ logger.info(
+ "--------------------------------------------------------------------------------"
+ )
+
  data_summary = metrics.get_mapstream_summary()
  try:
- dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+ dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
  dsfh.write(data_summary)
  dsfh.close()
  except IOError as e:
- print("I/O error({0}): {1}".format(e.errno, e.strerror))
- print("Unable to write file")
+ logger.exception(f"I/O error({e.errno}): {e.strerror}")
+ logger.exception("Unable to write file")
+ raise e

  # END mapstream
- nowtime = time.time()
- print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
-
- def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+ logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
+
+
+ def get_target_records(
+ tgtfilename: str,
+ tgtcolmap: dict[str, dict[str, int]],
+ rulesmap: dict[str, list[dict[str, list[str]]]],
+ srcfield: str,
+ srcdata: list[str],
+ srccolmap: dict[str, int],
+ srcfilename: str,
+ omopcdm: OmopCDM,
+ metrics: tools.metrics.Metrics) -> tuple[bool, list[str], tools.metrics.Metrics]:
  """
  build all target records for a given input field
  """
@@ -276,9 +380,10 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
  date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
  notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)

- srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
- summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
+ srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+ summarykey = srckey + "~all~"
  if valid_value(str(srcdata[srccolmap[srcfield]])):
+ ## check if either or both of the srckey and summarykey are in the rules
  srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
  dictkeys = []
  if srcfullkey in rulesmap:
@@ -287,10 +392,11 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
  if srckey in rulesmap:
  build_records = True
  dictkeys.append(srckey)
- if build_records == True:
+ if build_records:
  for dictkey in dictkeys:
  for out_data_elem in rulesmap[dictkey]:
  valid_data_elem = True
+ ## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
  tgtarray = ['']*len(tgtcolmap)
  for req_integer in notnull_numeric_fields:
  tgtarray[tgtcolmap[req_integer]] = "0"
@@ -302,6 +408,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
  else:
  tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
  if output_col_data in date_component_data:
+ ## parse the date and store it in the proper format
  strdate = srcdata[srccolmap[infield]].split(" ")[0]
  dt = get_datetime_value(strdate)
  if dt != None:
@@ -314,27 +421,47 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
  fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
  tgtarray[tgtcolmap[output_col_data]] = fulldate
  else:
- metrics.increment_key_count(summarykey, "invalid_date_fields")
+ metrics.increment_key_count(
+ source=srcfilename,
+ fieldname=srcfield,
+ tablename=tgtfilename,
+ concept_id="all",
+ additional="",
+ count_type="invalid_date_fields"
+ )
  valid_data_elem = False
  elif output_col_data in date_col_data:
  fulldate = srcdata[srccolmap[infield]]
  tgtarray[tgtcolmap[output_col_data]] = fulldate
  tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
- if valid_data_elem == True:
+ if valid_data_elem:
  tgtrecords.append(tgtarray)
  else:
- metrics.increment_key_count(summarykey, "invalid_source_fields")
-
+ metrics.increment_key_count(
+ source=srcfilename,
+ fieldname=srcfield,
+ tablename=tgtfilename,
+ concept_id="all",
+ additional="",
+ count_type="invalid_source_fields"
+ )

  return build_records, tgtrecords, metrics

+
  def valid_value(item):
  """
  Check if an item is non blank (null)
  """
  if item.strip() == "":
- return(False)
- return(True)
+ return False
+ return True
+
+
+ # DATE TESTING
+ # ------------
+ # I started by changing the get_datetime_value to be neater.
+ # I think it should be handled all as one thing, but I've spent too much time doing this already

  def valid_date_value(item):
  """
@@ -344,44 +471,33 @@ def valid_date_value(item):
  if item.strip() == "":
  return(False)
  if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
- #print("Bad date : {0}".format(item))
- return(False)
- return(True)
+ logger.warning("Bad date : {0}".format(item))
+ return False
+ return True
+

  def get_datetime_value(item):
  """
- Check if a date item is non null and parses as ISO (YYYY-MM-DD), reverse-ISO
- or dd/mm/yyyy or mm/dd/yyyy
+ Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+ or UK format (DD/MM/YYYY).
+ Returns a datetime object if successful, None otherwise.
  """
- dt = None
- # Does the date parse as an ISO date?
- try:
- dt = datetime.datetime.strptime(item, "%Y-%m-%d")
- except ValueError:
- pass
- if dt != None:
- return(dt)
-
- # Does the date parse as a reverse ISO date?
- try:
- dt = datetime.datetime.strptime(item, "%d-%m-%Y")
- except ValueError:
- pass
-
- if dt != None:
- return(dt)
-
- # Does the date parse as a UK old-style date?
- try:
- dt = datetime.datetime.strptime(item, "%d/%m/%Y")
- except ValueError:
- pass
-
- if dt != None:
- return(dt)
-
+ date_formats = [
+ "%Y-%m-%d", # ISO format (YYYY-MM-DD)
+ "%d-%m-%Y", # Reverse ISO format (DD-MM-YYYY)
+ "%d/%m/%Y", # UK old-style format (DD/MM/YYYY)
+ ]
+
+ for date_format in date_formats:
+ try:
+ return datetime.datetime.strptime(item, date_format)
+ except ValueError:
+ continue
+
+ # If we get here, none of the formats worked
  return None

+
  def parse_date(item):
  """
  Crude hand-coded check on date format
@@ -392,9 +508,8 @@ def parse_date(item):
  if len(datedata) != 3:
  return None
  if len(datedata[2]) == 4:
- return("{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0]))
- return("{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2]))
-
+ return(f"{datedata[2]}-{datedata[1]}-{datedata[0]}".format(datedata[2], datedata[1], datedata[0]))
+ return "-".join(datedata[:3])

  def valid_iso_date(item):
  """
@@ -403,9 +518,10 @@ def valid_iso_date(item):
  try:
  datetime.datetime.strptime(item, "%Y-%m-%d")
  except ValueError:
- return(False)
+ return False
+
+ return True

- return(True)

  def valid_reverse_iso_date(item):
  """
@@ -414,9 +530,10 @@ def valid_reverse_iso_date(item):
  try:
  datetime.datetime.strptime(item, "%d-%m-%Y")
  except ValueError:
- return(False)
+ return False
+
+ return True

- return(True)

  def valid_uk_date(item):
  """
@@ -425,12 +542,15 @@ def valid_uk_date(item):
  try:
  datetime.datetime.strptime(item, "%d/%m/%Y")
  except ValueError:
- return(False)
+ return False
+
+ return True
+

- return(True)
+ # End of date code

- def load_last_used_ids(last_used_ids_file, last_used_ids):
- fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
+ def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+ fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
  csvr = csv.reader(fh, delimiter="\t")

  for last_ids_data in csvr:
@@ -439,8 +559,9 @@ def load_last_used_ids(last_used_ids_file, last_used_ids):
  fh.close()
  return last_used_ids

- def load_saved_person_ids(person_file):
- fh = open(person_file, mode="r", encoding="utf-8-sig")
+
+ def load_saved_person_ids(person_file: Path):
+ fh = person_file.open(mode="r", encoding="utf-8-sig")
  csvr = csv.reader(fh, delimiter="\t")
  last_int = 1
  person_ids = {}
@@ -453,38 +574,47 @@ def load_saved_person_ids(person_file):
  fh.close()
  return person_ids, last_int

- def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
- fh = open(person_file, mode="r", encoding="utf-8-sig")
+ def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
+ person_ids, person_number = get_person_lookup(saved_person_id_file)
+
+ fh = person_file.open(mode="r", encoding="utf-8-sig")
  csvr = csv.reader(fh, delimiter=delim)
  person_columns = {}
  person_col_in_hdr_number = 0
  reject_count = 0

  personhdr = next(csvr)
- print(personhdr)
+ logger.info(personhdr)

  # Make a dictionary of column names vs their positions
  for col in personhdr:
  person_columns[col] = person_col_in_hdr_number
  person_col_in_hdr_number += 1

- birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
- print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+ ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+ birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+ "person"
+ )
+ logger.info(
+ "Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source)
+ )
+
+ ## get the column index of the PersonID from the input file
  person_col = person_columns[person_id_source]

  for persondata in csvr:
- if not valid_value(persondata[person_columns[person_id_source]]):
+ if not valid_value(persondata[person_columns[person_id_source]]): #just checking that the id is not an empty string
  reject_count += 1
  continue
  if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
  reject_count += 1
  continue
- if persondata[person_col] not in person_ids:
+ if persondata[person_col] not in person_ids: #if not already in person_ids dict, add it
  if use_input_person_ids == "N":
- person_ids[persondata[person_col]] = str(person_number)
+ person_ids[persondata[person_col]] = str(person_number) #create a new integer person_id
  person_number += 1
  else:
- person_ids[persondata[person_col]] = str(persondata[person_col])
+ person_ids[persondata[person_col]] = str(persondata[person_col]) #use existing person_id
  fh.close()

  return person_ids, reject_count
@@ -493,4 +623,126 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
  def py():
  pass

+
+ def check_dir_isvalid(directory: Path | tuple[Path, ...], create_if_missing: bool = False) -> None:
+ """Check if directory is valid, optionally create it if missing.
+
+ Args:
+ directory: Directory path as string or tuple
+ create_if_missing: If True, create directory if it doesn't exist
+ """
+
+ ## check directory has been set
+ if directory is None:
+ logger.warning("Directory not provided.")
+ sys.exit(1)
+
+ ## check output dir is valid
+ elif type(directory) is tuple:
+ directory = directory[0]
+
+
+ ## if not a directory, create it if requested (including parents. This option is for the output directory only).
+ if not directory.is_dir():
+ if create_if_missing:
+ try:
+ ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+ directory.mkdir(parents = True)
+ logger.info(f"Created directory: {directory}")
+ except OSError as e:
+ logger.warning(f"Failed to create directory {directory}: {e}")
+ sys.exit(1)
+ else:
+ logger.warning(f"Not a directory, dir {directory}")
+ sys.exit(1)
+
+ # Handle tuple input (like input_dir)
+ if isinstance(directory, tuple):
+ if not directory: # Empty tuple
+ print("No directory provided")
+ sys.exit(1)
+ directory = directory[0]
+
+ # Handle string input
+ dir_path = str(directory)
+ if not os.path.isdir(dir_path):
+ if create_if_missing:
+ try:
+ os.makedirs(dir_path)
+ print(f"Created directory: {dir_path}")
+ except OSError as e:
+ print(f"Failed to create directory {dir_path}: {e}")
+ sys.exit(1)
+ else:
+ print(f"Not a directory, dir {dir_path}")
+ sys.exit(1)
+
+
+ def set_saved_person_id_file(
+ saved_person_id_file: Path | None, output_dir: Path
+ ) -> Path:
+ """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
+ if saved_person_id_file is None:
+ saved_person_id_file = output_dir / "person_ids.tsv"
+ if saved_person_id_file.exists():
+ assert not saved_person_id_file.is_dir()
+ saved_person_id_file.unlink()
+ else:
+ assert not saved_person_id_file.is_dir()
+ return saved_person_id_file
+
+ def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
+ for infile in existing_input_files:
+ if infile not in rules_input_files:
+ msg = (
+ "WARNING: no mapping rules found for existing input file - {0}".format(
+ infile
+ )
+ )
+ logger.warning(msg)
+ for infile in rules_input_files:
+ if infile not in existing_input_files:
+ msg = "WARNING: no data for mapped input file - {0}".format(infile)
+ logger.warning(msg)
+
+ def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+ """opens a file and does something related to CSVs"""
+ try:
+ fh = file_path.open(mode="r", encoding="utf-8-sig")
+ csvr = csv.reader(fh)
+ return fh, csvr
+ except IOError as e:
+ logger.exception("Unable to open: {0}".format(file_path))
+ logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
+ return None
+
+
+ def set_omop_filenames(
+ omop_ddl_file: Path, omop_config_file: Path, omop_version: str
+ ) -> tuple[Path, Path]:
+ if (
+ (omop_ddl_file is None)
+ and (omop_config_file is None)
+ and (omop_version is not None)
+ ):
+ omop_config_file = (
+ importlib.resources.files("carrottransform") / "config/omop.json"
+ )
+ omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+ omop_ddl_file = (
+ importlib.resources.files("carrottransform") / "config" / omop_ddl_file_name
+ )
+ return omop_config_file, omop_ddl_file
+
+
+ def get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
+ # Saved-person-file existence test, reload if found, return last used integer
+ if saved_person_id_file.is_file():
+ person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+ else:
+ person_lookup = {}
+ last_used_integer = 1
+ return person_lookup, last_used_integer
+
  run.add_command(mapstream,"mapstream")
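
A small usage sketch of the rewritten date parsing shown in the diff: get_datetime_value now tries the ISO, reverse-ISO and UK formats in order and returns the first successful parse, or None. The import path is assumed from the package layout implied by the diff; the sample values are arbitrary.

# import path assumed; adjust if the module lives elsewhere in the package
from carrottransform.cli.subcommands.run import get_datetime_value

# the same calendar day written in the three accepted formats
for sample in ("2001-02-03", "03-02-2001", "03/02/2001"):
    print(sample, "->", get_datetime_value(sample))  # each parses to datetime(2001, 2, 3, 0, 0)

print(get_datetime_value("2001/02/03"))  # unsupported format, falls through to None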