carrot-transform 0.1.0__tar.gz

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.


@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) [2024] [Philip Duncan Appleby]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,48 @@
+ Metadata-Version: 2.3
+ Name: carrot_transform
+ Version: 0.1.0
+ Summary:
+ Author: anwarfg
+ Author-email: 913028+anwarfg@users.noreply.github.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: click (>=8.1.7,<9.0.0)
+ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
+ Description-Content-Type: text/markdown
+
+ <p align="center">
+   <a href="https://carrot.ac.uk/" target="_blank">
+     <picture>
+       <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
+       <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
+     </picture>
+   </a>
+ </p>
+ <div align="center">
+   <strong>
+     <h2>Streamlined Data Mapping to OMOP</h2>
+     <a href="https://carrot.ac.uk/">Carrot Transform</a> executes the conversion of source data to the OMOP CDM.<br />
+   </strong>
+ </div>
+
+ TODO:
+
+ - Document carrot-transform
+ - Add more in-code comments
+ - Handle capture of the ddl and json config via the command line as optional args
+
+ Carrot Transform is a reduction in complexity over the original CaRROT-CDM implementation of the _Transform_ part of _ETL_; in practice, _Extract_ is always performed by Data Partners and _Load_ by database bulk-load software.
+
+ Statistics
+
+ External libraries imported (approximate):
+
+ carrot-cdm        61
+ carrot-transform  12
+
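The Requires-Dist pins above look like the output of Poetry-style caret constraints; a hedged reconstruction of the corresponding pyproject.toml dependency table follows (an assumption for illustration only, since the build configuration itself is not part of this diff):

```toml
# Sketch only: reconstructed from the Requires-Dist pins above, not taken from the package.
[tool.poetry.dependencies]
python = ">=3.10,<4.0"   # matches Requires-Python
click = "^8.1.7"         # caret expands to >=8.1.7,<9.0.0
jinja2 = "^3.1.4"        # >=3.1.4,<4.0.0
pandas = "^2.2.3"        # >=2.2.3,<3.0.0
```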
@@ -0,0 +1,30 @@
+ <p align="center">
+   <a href="https://carrot.ac.uk/" target="_blank">
+     <picture>
+       <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
+       <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
+     </picture>
+   </a>
+ </p>
+ <div align="center">
+   <strong>
+     <h2>Streamlined Data Mapping to OMOP</h2>
+     <a href="https://carrot.ac.uk/">Carrot Transform</a> executes the conversion of source data to the OMOP CDM.<br />
+   </strong>
+ </div>
+
+ TODO:
+
+ - Document carrot-transform
+ - Add more in-code comments
+ - Handle capture of the ddl and json config via the command line as optional args
+
+ Carrot Transform is a reduction in complexity over the original CaRROT-CDM implementation of the _Transform_ part of _ETL_; in practice, _Extract_ is always performed by Data Partners and _Load_ by database bulk-load software.
+
+ Statistics
+
+ External libraries imported (approximate):
+
+ carrot-cdm        61
+ carrot-transform  12
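For orientation, a minimal sketch of how the `mapstream` subcommand defined later in this diff might be invoked; the installed command name and every path here are assumptions for illustration, not taken from the package:

```sh
# Sketch, assuming the console script is exposed as "carrot-transform".
carrot-transform run mapstream \
    --rules-file rules.json \
    --person-file demographics.csv \
    --output-dir omop-output \
    --omop-version "5.3" \
    input-csvs
```

Per the initialisation block in `mapstream` below, supplying only `--omop-version` makes the tool fall back to the packaged `config/omop.json` and the matching `OMOPCDM_postgresql_<version>_ddl.sql`.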
@@ -0,0 +1,5 @@
+ from ._version import __version__
+
+ params = {
+     'version': __version__,
+ }
@@ -0,0 +1,2 @@
+ # TODO - pick this up automatically when building
+ __version__ = '0.3.2'
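The TODO above could be handled at runtime with the standard library's `importlib.metadata`, which reads the version from the installed distribution's metadata; a minimal sketch, assuming the distribution name `carrot-transform` (this is not the shipped code):

```python
# _version.py - sketch of automatic version pickup (assumption, not the shipped code)
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("carrot-transform")  # reads installed package metadata
except PackageNotFoundError:
    # fall back when running from an uninstalled source checkout
    __version__ = "0.0.0"
```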
File without changes
@@ -0,0 +1,21 @@
+ # Package entry point - sets up the "run" subcommand
+ from .subcommands.run import run
+
+ import carrottransform as c
+ import click
+
+ @click.group(invoke_without_command=True)
+ @click.option("--version", "-v", is_flag=True)
+ @click.pass_context
+ def transform(ctx, version):
+     # with no subcommand, print the version (if asked for) or the help text
+     if ctx.invoked_subcommand is None:
+         if version:
+             click.echo(c.__version__)
+         else:
+             click.echo(ctx.get_help())
+     return
+
+ transform.add_command(run, "run")
+
+ if __name__ == "__main__":
+     transform()
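Taken together with `_version.py` above, invoking the bare group with the version flag echoes the version string; for example, assuming the console script is installed as `carrot-transform`:

```sh
$ carrot-transform --version
0.3.2
```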
@@ -0,0 +1,496 @@
+ import csv
+ import datetime
+ import fnmatch
+ import importlib.resources
+ import json
+ import os
+ import sys
+ import time
+
+ import click
+
+ import carrottransform
+ import carrottransform.tools as tools
+
+ @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+ def run():
+     pass
+
+ @click.command()
+ @click.option("--rules-file",
+               required=True,
+               help="json file containing mapping rules")
+ @click.option("--output-dir",
+               default=None,
+               help="define the output directory for OMOP-format tsv files")
+ @click.option("--write-mode",
+               default='w',
+               type=click.Choice(['w', 'a']),
+               help="force write-mode on output files")
+ @click.option("--person-file",
+               required=True,
+               help="file containing person_ids in the first column")
+ @click.option("--omop-ddl-file",
+               required=False,
+               help="file containing OHDSI ddl statements for OMOP tables")
+ @click.option("--omop-config-file",
+               required=False,
+               help="file containing additional / override json config for OMOP outputs")
+ @click.option("--omop-version",
+               required=False,
+               help="quoted string containing the OMOP version - e.g. '5.3'")
+ @click.option("--saved-person-id-file",
+               default=None,
+               required=False,
+               help="full path to the person-id file used to save person_id state and share person_ids between data sets")
+ @click.option("--use-input-person-ids",
+               required=False,
+               default='N',
+               help="use person ids as input without generating new integers")
+ @click.option("--last-used-ids-file",
+               default=None,
+               required=False,
+               help="full path to the last-used-ids file for OMOP tables - format: tablename\tlast_used_id, where last_used_id must be an integer")
+ @click.option("--log-file-threshold",
+               required=False,
+               default=0,
+               help="lower outcount limit for logfile output")
+ @click.argument("input-dir",
+                 required=False,
+                 nargs=-1)
+ def mapstream(rules_file, output_dir, write_mode,
+               person_file, omop_ddl_file, omop_config_file,
+               omop_version, saved_person_id_file, use_input_person_ids,
+               last_used_ids_file, log_file_threshold, input_dir):
+     """
+     Map to output using input streams
+     """
+     # Initialisation
+     # - check for values in optional arguments
+     # - read in configuration files
+     # - check main directories for existence
+     # - handle saved person ids
+     # - initialise metrics
+     if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
+         omop_config_file = str(importlib.resources.files('carrottransform')) + '/config/omop.json'
+         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+         omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/config/' + omop_ddl_file_name
+
+     if not input_dir:
+         # nargs=-1 allows zero arguments; fail early if no input directory was given
+         print("No input directory given")
+         sys.exit(1)
+
+     if not os.path.isdir(input_dir[0]):
+         print("Not a directory, input dir {0}".format(input_dir[0]))
+         sys.exit(1)
+
+     if not os.path.isdir(output_dir):
+         print("Not a directory, output dir {0}".format(output_dir))
+         sys.exit(1)
+
+     if saved_person_id_file is None:
+         saved_person_id_file = output_dir + "/person_ids.tsv"
+         if os.path.exists(saved_person_id_file):
+             os.remove(saved_person_id_file)
+
+     starttime = time.time()
+     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
+     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
+     nowtime = time.time()
+
+     print("--------------------------------------------------------------------------------")
+     print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+     output_files = mappingrules.get_all_outfile_names()
+     record_numbers = {}
+     for output_file in output_files:
+         record_numbers[output_file] = 1
+
+     fhd = {}
+     tgtcolmaps = {}
+
+     try:
+         # Saved-person-file existence test, reload if found, return last used integer
+         if os.path.isfile(saved_person_id_file):
+             person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+         else:
+             person_lookup = {}
+             last_used_integer = 1
+         if last_used_ids_file is not None:
+             if os.path.isfile(last_used_ids_file):
+                 record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+
+         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
+         fhpout = open(saved_person_id_file, mode="w")
+         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+         for person_id, person_assigned_id in person_lookup.items():
+             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+         fhpout.close()
+         # Initialise output files, output a header for each
+         for tgtfile in output_files:
+             fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
+             if write_mode == 'w':
+                 outhdr = omopcdm.get_omop_column_list(tgtfile)
+                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
+             tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
+
+     except IOError as e:
+         print("I/O error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+         sys.exit(1)
+
+     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+
+     # Compare files found in the input_dir with those expected based on mapping rules
+     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+     rules_input_files = mappingrules.get_all_infile_names()
+     # Log mismatches but continue
+     for infile in existing_input_files:
+         if infile not in rules_input_files:
+             print("ERROR: no mapping rules found for existing input file - {0}".format(infile))
+     for infile in rules_input_files:
+         if infile not in existing_input_files:
+             print("ERROR: no data for mapped input file - {0}".format(infile))
+
+     # set up overall counts
+     rejidcounts = {}
+     rejdatecounts = {}
+     print(rules_input_files)
+
+     # set up per-input counts
+     for srcfilename in rules_input_files:
+         rejidcounts[srcfilename] = 0
+         rejdatecounts[srcfilename] = 0
+
+     # main processing loop, for each input file
+     for srcfilename in rules_input_files:
+         outcounts = {}
+         rejcounts = {}
+         rcount = 0
+
+         try:
+             fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
+             csvr = csv.reader(fh)
+         except IOError as e:
+             print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
+             print("I/O error({0}): {1}".format(e.errno, e.strerror))
+             continue
+
+         tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
+         infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+         for tgtfile in tgtfiles:
+             outcounts[tgtfile] = 0
+             rejcounts[tgtfile] = 0
+         datacolsall = []
+         hdrdata = next(csvr)
+         dflist = mappingrules.get_infile_data_fields(srcfilename)
+         for colname in hdrdata:
+             datacolsall.append(colname)
+         inputcolmap = omopcdm.get_column_map(hdrdata)
+         pers_id_col = inputcolmap[infile_person_id_source]
+         datetime_col = inputcolmap[infile_datetime_source]
+         print("--------------------------------------------------------------------------------")
+         print("Processing input: {0}".format(srcfilename))
+
+         # for each input record
+         for indata in csvr:
+             key = srcfilename + "~all~all~all~"
+             metrics.increment_key_count(key, "input_count")
+             rcount += 1
+             strdate = indata[datetime_col].split(" ")[0]
+             fulldate = parse_date(strdate)
+             if fulldate is not None:
+                 indata[datetime_col] = fulldate
+             else:
+                 metrics.increment_key_count(key, "invalid_date_fields")
+                 continue
+
+             for tgtfile in tgtfiles:
+                 tgtcolmap = tgtcolmaps[tgtfile]
+                 auto_num_col = omopcdm.get_omop_auto_number_field(tgtfile)
+                 pers_id_col = omopcdm.get_omop_person_id_field(tgtfile)
+
+                 datacols = datacolsall
+                 if tgtfile in dflist:
+                     datacols = dflist[tgtfile]
+
+                 for datacol in datacols:
+                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
+                     if built_records:
+                         for outrecord in outrecords:
+                             if auto_num_col is not None:
+                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                                 record_numbers[tgtfile] += 1
+                             if outrecord[tgtcolmap[pers_id_col]] in person_lookup:
+                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                 outcounts[tgtfile] += 1
+                                 key = srcfilename + "~all~all~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 key = "all~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 key = srcfilename + "~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 if tgtfile == "person":
+                                     key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
+                                     metrics.increment_key_count(key, "output_count")
+                                 else:
+                                     key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = "all~all~all~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
+                             else:
+                                 key = srcfilename + "~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "invalid_person_ids")
+                                 rejidcounts[srcfilename] += 1
+
+         fh.close()
+
+         nowtime = time.time()
+         print("INPUT file data : {0}: input count {1}, time since start {2:.5f} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+         for outtablename, count in outcounts.items():
+             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+     # END main processing loop
+
+     print("--------------------------------------------------------------------------------")
+     data_summary = metrics.get_mapstream_summary()
+     try:
+         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+         dsfh.write(data_summary)
+         dsfh.close()
+     except IOError as e:
+         print("I/O error({0}): {1}".format(e.errno, e.strerror))
+         print("Unable to write file")
+
+     # END mapstream
+     nowtime = time.time()
+     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
+
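+ # Annotation added for readability (not part of the original source): the metrics keys
+ # used above and below are tilde-separated "source-file~source-field~target-table~concept-id~"
+ # strings, with "all" acting as a wildcard at any position.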
+ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+     """
+     Build all target records for a given input field
+     """
+     build_records = False
+     tgtrecords = []
+     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+     srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
+     summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
+     if valid_value(str(srcdata[srccolmap[srcfield]])):
+         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
+         dictkeys = []
+         if srcfullkey in rulesmap:
+             build_records = True
+             dictkeys.append(srcfullkey)
+         if srckey in rulesmap:
+             build_records = True
+             dictkeys.append(srckey)
+         if build_records:
+             for dictkey in dictkeys:
+                 for out_data_elem in rulesmap[dictkey]:
+                     valid_data_elem = True
+                     tgtarray = [''] * len(tgtcolmap)
+                     # not-null numeric fields default to "0"
+                     for req_integer in notnull_numeric_fields:
+                         tgtarray[tgtcolmap[req_integer]] = "0"
+                     for infield, outfield_list in out_data_elem.items():
+                         for output_col_data in outfield_list:
+                             if "~" in output_col_data:
+                                 # "column~term" entries carry a mapped term for the output column
+                                 outcol, term = output_col_data.split("~")
+                                 tgtarray[tgtcolmap[outcol]] = term
+                             else:
+                                 tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
+                                 if output_col_data in date_component_data:
+                                     strdate = srcdata[srccolmap[infield]].split(" ")[0]
+                                     dt = get_datetime_value(strdate)
+                                     if dt is not None:
+                                         year_field = date_component_data[output_col_data]["year"]
+                                         month_field = date_component_data[output_col_data]["month"]
+                                         day_field = date_component_data[output_col_data]["day"]
+                                         tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                         tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                         tgtarray[tgtcolmap[day_field]] = str(dt.day)
+                                         fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                                         tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                     else:
+                                         metrics.increment_key_count(summarykey, "invalid_date_fields")
+                                         valid_data_elem = False
+                                 elif output_col_data in date_col_data:
+                                     fulldate = srcdata[srccolmap[infield]]
+                                     tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                     tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
+                     if valid_data_elem:
+                         tgtrecords.append(tgtarray)
+     else:
+         metrics.increment_key_count(summarykey, "invalid_source_fields")
+
+     return build_records, tgtrecords, metrics
+
+ def valid_value(item):
+     """
+     Check that an item is not blank (null)
+     """
+     if item.strip() == "":
+         return False
+     return True
+
+ def valid_date_value(item):
+     """
+     Check that a date item is non-null and parses as ISO (YYYY-MM-DD), reverse ISO
+     (DD-MM-YYYY) or UK-style (DD/MM/YYYY)
+     """
+     if item.strip() == "":
+         return False
+     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
+         return False
+     return True
+
+ def get_datetime_value(item):
+     """
+     Parse a date item as ISO (YYYY-MM-DD), reverse ISO (DD-MM-YYYY) or UK-style
+     (DD/MM/YYYY); return a datetime, or None if no format matches
+     """
+     # Does the date parse as an ISO date?
+     try:
+         return datetime.datetime.strptime(item, "%Y-%m-%d")
+     except ValueError:
+         pass
+
+     # Does the date parse as a reverse ISO date?
+     try:
+         return datetime.datetime.strptime(item, "%d-%m-%Y")
+     except ValueError:
+         pass
+
+     # Does the date parse as a UK old-style date?
+     try:
+         return datetime.datetime.strptime(item, "%d/%m/%Y")
+     except ValueError:
+         pass
+
+     return None
+
+ def parse_date(item):
+     """
+     Crude hand-coded check on date format: normalise a "-" or "/"-separated date
+     to YYYY-MM-DD, assuming day-first order when the year comes last
+     """
+     datedata = item.split("-")
+     if len(datedata) != 3:
+         datedata = item.split("/")
+     if len(datedata) != 3:
+         return None
+     if len(datedata[2]) == 4:
+         return "{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0])
+     return "{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2])
+
+ def valid_iso_date(item):
+     """
+     Check that a date item parses as ISO (YYYY-MM-DD)
+     """
+     try:
+         datetime.datetime.strptime(item, "%Y-%m-%d")
+     except ValueError:
+         return False
+     return True
+
+ def valid_reverse_iso_date(item):
+     """
+     Check that a date item parses as reverse ISO (DD-MM-YYYY)
+     """
+     try:
+         datetime.datetime.strptime(item, "%d-%m-%Y")
+     except ValueError:
+         return False
+     return True
+
+ def valid_uk_date(item):
+     """
+     Check that a date item parses as UK format (DD/MM/YYYY)
+     """
+     try:
+         datetime.datetime.strptime(item, "%d/%m/%Y")
+     except ValueError:
+         return False
+     return True
+
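+ # Illustrative annotation (added in this edit, not part of the original source):
+ # parse_date("25/12/2020") and parse_date("2020-12-25") both normalise to "2020-12-25";
+ # get_datetime_value returns a datetime for "2020-12-25", "25-12-2020" or "25/12/2020"
+ # and None for anything else.
+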
+ def load_last_used_ids(last_used_ids_file, last_used_ids):
+     fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter="\t")
+
+     # each row is: tablename<TAB>last_used_id; numbering resumes from the next integer
+     for last_ids_data in csvr:
+         last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+     fh.close()
+     return last_used_ids
+
+ def load_saved_person_ids(person_file):
+     fh = open(person_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter="\t")
+     last_int = 1
+     person_ids = {}
+
+     next(csvr)  # skip the SOURCE_SUBJECT/TARGET_SUBJECT header
+     for persondata in csvr:
+         person_ids[persondata[0]] = persondata[1]
+         last_int += 1
+
+     fh.close()
+     return person_ids, last_int
+
+ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
+     fh = open(person_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter=delim)
+     person_columns = {}
+     person_col_in_hdr_number = 0
+     reject_count = 0
+
+     personhdr = next(csvr)
+     print(personhdr)
+
+     # Make a dictionary of column names vs their positions
+     for col in personhdr:
+         person_columns[col] = person_col_in_hdr_number
+         person_col_in_hdr_number += 1
+
+     birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
+     print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+     person_col = person_columns[person_id_source]
+
+     for persondata in csvr:
+         # reject rows with a blank person id or an unparseable birth datetime
+         if not valid_value(persondata[person_columns[person_id_source]]):
+             reject_count += 1
+             continue
+         if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+             reject_count += 1
+             continue
+         if persondata[person_col] not in person_ids:
+             if use_input_person_ids == "N":
+                 person_ids[persondata[person_col]] = str(person_number)
+                 person_number += 1
+             else:
+                 person_ids[persondata[person_col]] = str(persondata[person_col])
+     fh.close()
+
+     return person_ids, reject_count
+
+ @click.group(help="Commands for using python configurations to run the ETL transformation.")
+ def py():
+     pass
+
+ run.add_command(mapstream, "mapstream")
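
Both state files read and written above are plain TSV; a sketch of their contents as implied by load_saved_person_ids and load_last_used_ids (the values themselves are invented for illustration):

```text
# person_ids.tsv - written next to the OMOP output, reloaded on subsequent runs
SOURCE_SUBJECT	TARGET_SUBJECT
P0001	1
P0002	2

# --last-used-ids-file - one "tablename<TAB>last_used_id" row per OMOP table
person	2
observation	117
```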