carrot-transform 0.3 (carrot_transform-0.3-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of carrot-transform might be problematic.

@@ -0,0 +1,21 @@ carrot_transform-0.3.dist-info/LICENSE
+ MIT License
+
+ Copyright (c) [2024] [Philip Duncan Appleby]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,28 @@ carrot_transform-0.3.dist-info/METADATA
+ Metadata-Version: 2.1
+ Name: carrot-transform
+ Version: 0.3
+ Summary: Carrot simple transformer, input rules and data csvs, output OMOP
+ Author-email: PD Appleby <pdappleby@gmail.com>
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+
+ # carrot-transform
+
+ TODO:
+ * Document carrot-transform
+ * Add more comments in-code
+ * Handle capture of ddl and json config via the command-line as optional args
+
+ Reduction in complexity over the original CaRROT-CDM version for the Transform part of *ETL* - in practice, *Extract* is always
+ performed by Data Partners and *Load* by database bulk-load software.
+
+ Statistics
+
+ External libraries imported (approximate)
+
+ carrot-cdm       61
+ carrot-transform 12
@@ -0,0 +1,19 @@ carrot_transform-0.3.dist-info/RECORD
+ carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
+ carrottransform/_version.py,sha256=GIcaSIQ2wetvh_X8XcZC4nmbIniXNzgn9zFpgXoMWW8,70
+ carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
+ carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ carrottransform/cli/subcommands/run.py,sha256=J081wG4C6gQYNB_ahejyxtoNA_ZI6Aq5YOopWEtAWLw,20384
+ carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
+ carrottransform/config/omop.json,sha256=WiA1XeEd9K3dH3DRN1uJAzjzQpslGlmL-AxJ9z1PDQI,1687
+ carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
+ carrottransform/tools/file_helpers.py,sha256=15iNY7qDMXc1p_KHb77ZnV4Tx7wi-vkiufZE4tz6DiM,276
+ carrottransform/tools/mappingrules.py,sha256=ru7sExFHEQA0eVbY68P-HQyGtZLUM1NxC_AWKIzgQzQ,6335
+ carrottransform/tools/metrics.py,sha256=r6Q2-rt9C13D5fTiwEdxuHx_NjyHpy1zMhcHxCZpZfc,5505
+ carrottransform/tools/omopcdm.py,sha256=BdftE6-E0oJwGIrOfrjP8gpEAIR5JQhR1DVzWDrzNO8,7365
+ carrot_transform-0.3.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+ carrot_transform-0.3.dist-info/METADATA,sha256=7u8s94DZd8CfODLRBI_HygqzGNadhlilT5iUa2QOdV8,865
+ carrot_transform-0.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ carrot_transform-0.3.dist-info/entry_points.txt,sha256=WSJqmgB8PEK8iMl3IFEMBYuyXtzHX5PaKbG13R54AH4,75
+ carrot_transform-0.3.dist-info/top_level.txt,sha256=UXPSohnlYfzndis3fEcl6f-dg80qwrKdPjnnSsggEUs,16
+ carrot_transform-0.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@ carrot_transform-0.3.dist-info/WHEEL
+ Wheel-Version: 1.0
+ Generator: setuptools (75.3.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1,2 @@ carrot_transform-0.3.dist-info/entry_points.txt
+ [console_scripts]
+ carrot-transform = carrottransform.cli.command:transform
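This `console_scripts` entry is what makes an installed wheel expose a `carrot-transform` executable that resolves to the `transform` click group. A minimal sketch of resolving the entry point programmatically (an illustration only, assuming the wheel is installed and Python 3.10+ `importlib.metadata`):

```python
from importlib.metadata import entry_points

# Find the console script declared in entry_points.txt (requires the wheel
# to be installed in the current environment).
eps = entry_points(group="console_scripts")
ep = next(e for e in eps if e.name == "carrot-transform")

transform = ep.load()  # resolves to carrottransform.cli.command:transform
transform(["--version"], standalone_mode=False)  # echoes the package version
```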
@@ -0,0 +1 @@ carrot_transform-0.3.dist-info/top_level.txt
+ carrottransform
@@ -0,0 +1,5 @@ carrottransform/__init__.py
+ from ._version import __version__
+
+ params = {
+     'version': __version__,
+ }
@@ -0,0 +1,2 @@ carrottransform/_version.py
+ # TODO - pick this up automatically when building
+ __version__ = '0.3'
File without changes (carrottransform/cli/__init__.py, an empty file)
@@ -0,0 +1,21 @@ carrottransform/cli/command.py
+ # Package entry point - sets up the "run" subcommand
+ from .subcommands.run import run
+
+ import carrottransform as c
+ import click
+
+ @click.group(invoke_without_command=True)
+ @click.option("--version", "-v", is_flag=True)
+ @click.pass_context
+ def transform(ctx, version):
+     if ctx.invoked_subcommand is None:
+         if version:
+             click.echo(c.__version__)
+         else:
+             click.echo(ctx.get_help())
+     return
+
+ transform.add_command(run, "run")
+
+ if __name__ == "__main__":
+     transform()
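The group above echoes the version or its own help when invoked without a subcommand. A quick way to exercise it without installing the wheel is click's test runner; a minimal sketch, assuming `carrottransform` is on the import path:

```python
from click.testing import CliRunner
from carrottransform.cli.command import transform

runner = CliRunner()

# With no subcommand and no flag, the group prints its own help text.
result = runner.invoke(transform, [])
assert "Usage" in result.output

# -v/--version echoes carrottransform.__version__ ("0.3" for this release).
result = runner.invoke(transform, ["--version"])
assert result.output.strip() == "0.3"
```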
File without changes (carrottransform/cli/subcommands/__init__.py, an empty file)
@@ -0,0 +1,484 @@ carrottransform/cli/subcommands/run.py
+ import csv
+ import os, time
+ import datetime
+ import fnmatch
+ import sys
+ import click
+ import json
+ import importlib.resources
+ import carrottransform
+ import carrottransform.tools as tools
+
+ @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
+ def run():
+     pass
+
+ @click.command()
+ @click.option("--rules-file",
+               required=True,
+               help="json file containing mapping rules")
+ @click.option("--output-dir",
+               default=None,
+               help="define the output directory for OMOP-format tsv files")
+ @click.option("--write-mode",
+               default='w',
+               type=click.Choice(['w', 'a']),
+               help="force write-mode on output files")
+ @click.option("--person-file",
+               required=True,
+               help="File containing person_ids in the first column")
+ @click.option("--omop-version",
+               required=True,
+               help="Quoted string containing omop version - e.g. '5.3'")
+ @click.option("--saved-person-id-file",
+               default=None,
+               required=False,
+               help="Full path to person id file used to save person_id state and share person_ids between data sets")
+ @click.option("--use-input-person-ids",
+               required=False,
+               default='N',
+               help="Use person ids as input without generating new integers")
+ @click.option("--last-used-ids-file",
+               default=None,
+               required=False,
+               help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer")
+ @click.option("--log-file-threshold",
+               required=False,
+               default=0,
+               help="Lower outcount limit for logfile output")
+ @click.argument("input-dir",
+                 required=False,
+                 nargs=-1)
+ def mapstream(rules_file, output_dir, write_mode, person_file, omop_version, saved_person_id_file, use_input_person_ids, last_used_ids_file, log_file_threshold, input_dir):
+     """
+     Map to output using input streams
+     """
+     omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+     omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+     omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+
+     if not os.path.isdir(input_dir[0]):
+         print("Not a directory, input dir {0}".format(input_dir[0]))
+         sys.exit(1)
+
+     if not os.path.isdir(output_dir):
+         print("Not a directory, output dir {0}".format(output_dir))
+         sys.exit(1)
+
+     if saved_person_id_file is None:
+         saved_person_id_file = output_dir + "/" + "person_ids.tsv"
+         if os.path.exists(saved_person_id_file):
+             os.remove(saved_person_id_file)
+
+     starttime = time.time()
+     omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+     #print(omopcdm.dump_ddl())
+     mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
+     metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
+     nowtime = time.time()
+
+     print("--------------------------------------------------------------------------------")
+     print("Loaded mapping rules from: {0} after {1:.5f} secs".format(rules_file, (nowtime - starttime)))
+     output_files = mappingrules.get_all_outfile_names()
+     record_numbers = {}
+     for output_file in output_files:
+         record_numbers[output_file] = 1
+
+     fhd = {}
+     tgtcolmaps = {}
+
+     try:
+         # Reload from a saved-person-id file if one already exists, recovering the last used integer
+         if os.path.isfile(saved_person_id_file):
+             person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+         else:
+             person_lookup = {}
+             last_used_integer = 1
+         if last_used_ids_file is not None:
+             if os.path.isfile(last_used_ids_file):
+                 record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
+
+         #fhp = open(person_file, mode="r", encoding="utf-8-sig")
+         #csvrp = csv.reader(fhp)
+         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
+         fhpout = open(saved_person_id_file, mode="w")
+         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+         for person_id, person_assigned_id in person_lookup.items():
+             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+         fhpout.close()
+         for tgtfile in output_files:
+             fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
+             if write_mode == 'w':
+                 outhdr = omopcdm.get_omop_column_list(tgtfile)
+                 fhd[tgtfile].write("\t".join(outhdr) + "\n")
+             tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
+
+     except IOError as e:
+         print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
+         sys.exit(1)
+
+     print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
+
+     # TODO get this list of input files from the parsed rules
+     existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
+     rules_input_files = mappingrules.get_all_infile_names()
+     for infile in existing_input_files:
+         if infile not in rules_input_files:
+             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
+             print(msg)
+             metrics.add_log_data(msg)
+     for infile in rules_input_files:
+         if infile not in existing_input_files:
+             msg = "ERROR: no data for mapped input file - {0}".format(infile)
+             print(msg)
+             metrics.add_log_data(msg)
+     rejidcounts = {}
+     rejdatecounts = {}
+     #src_tgt_counts = {}
+     print(rules_input_files)
+
+     for srcfilename in rules_input_files:
+         rejidcounts[srcfilename] = 0
+         rejdatecounts[srcfilename] = 0
+
+     for srcfilename in rules_input_files:
+         outcounts = {}
+         rejcounts = {}
+         rcount = 0
+
+         try:
+             fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
+             csvr = csv.reader(fh)
+         except IOError as e:
+             print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
+             print("I/O error({0}): {1}".format(e.errno, e.strerror))
+             continue
+
+         tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
+         infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
+         for tgtfile in tgtfiles:
+             outcounts[tgtfile] = 0
+             rejcounts[tgtfile] = 0
+         datacolsall = []
+         hdrdata = next(csvr)
+         dflist = mappingrules.get_infile_data_fields(srcfilename)
+         for colname in hdrdata:
+             datacolsall.append(colname)
+         inputcolmap = omopcdm.get_column_map(hdrdata)
+         pers_id_col = inputcolmap[infile_person_id_source]
+         datetime_col = inputcolmap[infile_datetime_source]
+         print("--------------------------------------------------------------------------------")
+         print("Processing input: {0}".format(srcfilename))
+         # print("Processing input: {0}, All input cols = {1}, Data cols = {2}".format(srcfilename, str(datacolsall), str(dflist)))
+
+         for indata in csvr:
+             #indata = inputline.strip().split(",")
+             key = srcfilename + "~all~all~all~"
+             metrics.increment_key_count(key, "input_count")
+             rcount += 1
+             strdate = indata[datetime_col].split(" ")[0]
+             fulldate = parse_date(strdate)
+             if fulldate is not None:
+                 #fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                 indata[datetime_col] = fulldate
+             else:
+                 metrics.increment_key_count(key, "invalid_date_fields")
+                 continue
+
+             for tgtfile in tgtfiles:
+                 tgtcolmap = tgtcolmaps[tgtfile]
+                 auto_num_col = omopcdm.get_omop_auto_number_field(tgtfile)
+                 pers_id_col = omopcdm.get_omop_person_id_field(tgtfile)
+
+                 datacols = datacolsall
+                 if tgtfile in dflist:
+                     datacols = dflist[tgtfile]
+
+                 for datacol in datacols:
+                     built_records, outrecords, metrics = get_target_records(tgtfile, tgtcolmap, src_to_tgt, datacol, indata, inputcolmap, srcfilename, omopcdm, metrics)
+                     if built_records:
+                         for outrecord in outrecords:
+                             if auto_num_col is not None:
+                                 outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                                 record_numbers[tgtfile] += 1
+                             if outrecord[tgtcolmap[pers_id_col]] in person_lookup:
+                                 outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
+                                 outcounts[tgtfile] += 1
+                                 key = srcfilename + "~all~all~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 key = "all~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 key = srcfilename + "~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "output_count")
+                                 if tgtfile == "person":
+                                     key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
+                                     metrics.increment_key_count(key, "output_count")
+                                 else:
+                                     key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                     key = "all~all~all~" + outrecord[2] + "~"
+                                     metrics.increment_key_count(key, "output_count")
+                                 fhd[tgtfile].write("\t".join(outrecord) + "\n")
+                             else:
+                                 key = srcfilename + "~all~" + tgtfile + "~all~"
+                                 metrics.increment_key_count(key, "invalid_person_ids")
+                                 rejidcounts[srcfilename] += 1
+
+         fh.close()
+
+         nowtime = time.time()
+         print("INPUT file data : {0}: input count {1}, time since start {2:.5} secs".format(srcfilename, str(rcount), (nowtime - starttime)))
+         for outtablename, count in outcounts.items():
+             print("TARGET: {0}: output count {1}".format(outtablename, str(count)))
+
+     print("--------------------------------------------------------------------------------")
+     data_summary = metrics.get_mapstream_summary()
+     log_report = metrics.get_log_data()
+     try:
+         dsfh = open(output_dir + "/summary_mapstream.tsv", mode="w")
+         dsfh.write(data_summary)
+         dsfh.close()
+         logfh = open(output_dir + "/error_report.txt", mode="w")
+         logfh.write(log_report)
+         logfh.close()
+     except IOError as e:
+         print("I/O error({0}): {1}".format(e.errno, e.strerror))
+         print("Unable to write file")
+
+     nowtime = time.time()
+     print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
+     #profiler.disable()
+     #stats = pstats.Stats(profiler).sort_stats('ncalls')
+     #stats.print_stats()
+
+ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+     build_records = False
+     tgtrecords = []
+     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+     srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
+     summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
+     if valid_value(str(srcdata[srccolmap[srcfield]])):
+         srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
+         dictkeys = []
+         if srcfullkey in rulesmap:
+             build_records = True
+             dictkeys.append(srcfullkey)
+         if srckey in rulesmap:
+             build_records = True
+             dictkeys.append(srckey)
+         if build_records:
+             for dictkey in dictkeys:
+                 for out_data_elem in rulesmap[dictkey]:
+                     valid_data_elem = True
+                     tgtarray = [''] * len(tgtcolmap)
+                     for req_integer in notnull_numeric_fields:
+                         tgtarray[tgtcolmap[req_integer]] = "0"
+                     for infield, outfield_list in out_data_elem.items():
+                         for output_col_data in outfield_list:
+                             if "~" in output_col_data:
+                                 outcol, term = output_col_data.split("~")
+                                 tgtarray[tgtcolmap[outcol]] = term
+                             else:
+                                 tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
+                                 if output_col_data in date_component_data:
+                                     strdate = srcdata[srccolmap[infield]].split(" ")[0]
+                                     dt = get_datetime_value(strdate)
+                                     if dt is not None:
+                                         year_field = date_component_data[output_col_data]["year"]
+                                         month_field = date_component_data[output_col_data]["month"]
+                                         day_field = date_component_data[output_col_data]["day"]
+                                         tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                         tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                         tgtarray[tgtcolmap[day_field]] = str(dt.day)
+                                         fulldate = "{0}-{1:02}-{2:02}".format(dt.year, dt.month, dt.day)
+                                         tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                     else:
+                                         metrics.increment_key_count(summarykey, "invalid_date_fields")
+                                         valid_data_elem = False
+                                 elif output_col_data in date_col_data:
+                                     fulldate = srcdata[srccolmap[infield]]
+                                     tgtarray[tgtcolmap[output_col_data]] = fulldate
+                                     tgtarray[tgtcolmap[date_col_data[output_col_data]]] = fulldate
+                     if valid_data_elem:
+                         tgtrecords.append(tgtarray)
+     else:
+         metrics.increment_key_count(summarykey, "invalid_source_fields")
+
+
+     return build_records, tgtrecords, metrics
+
+ def valid_value(item):
+     """
+     Check if an item is non-blank (not null)
+     """
+     if item.strip() == "":
+         return False
+     return True
+
+ def valid_date_value(item):
+     """
+     Check if a date item is non-null and parses as ISO (YYYY-MM-DD),
+     reverse ISO (DD-MM-YYYY) or UK format (DD/MM/YYYY)
+     """
+     if item.strip() == "":
+         return False
+     if not valid_iso_date(item) and not valid_reverse_iso_date(item) and not valid_uk_date(item):
+         #print("Bad date : {0}".format(item))
+         return False
+     return True
+
+ def get_datetime_value(item):
+     """
+     Parse a date item as ISO (YYYY-MM-DD), reverse ISO (DD-MM-YYYY)
+     or UK format (DD/MM/YYYY); return a datetime, or None if nothing matches
+     """
+     dt = None
+     # Does the date parse as an ISO date?
+     try:
+         dt = datetime.datetime.strptime(item, "%Y-%m-%d")
+     except ValueError:
+         pass
+     if dt is not None:
+         return dt
+
+     # Does the date parse as a reverse ISO date?
+     try:
+         dt = datetime.datetime.strptime(item, "%d-%m-%Y")
+     except ValueError:
+         pass
+
+     if dt is not None:
+         return dt
+
+     # Does the date parse as a UK old-style date?
+     try:
+         dt = datetime.datetime.strptime(item, "%d/%m/%Y")
+     except ValueError:
+         pass
+
+     if dt is not None:
+         return dt
+
+     return None
+
+ def parse_date(item):
+     """
+     Crude hand-coded check on date format; returns a YYYY-MM-DD string, or None
+     """
+     datedata = item.split("-")
+     if len(datedata) != 3:
+         datedata = item.split("/")
+         if len(datedata) != 3:
+             return None
+     if len(datedata[2]) == 4:
+         return "{0}-{1}-{2}".format(datedata[2], datedata[1], datedata[0])
+     return "{0}-{1}-{2}".format(datedata[0], datedata[1], datedata[2])
+
+
+ def valid_iso_date(item):
+     """
+     Check if a date item parses as ISO (YYYY-MM-DD)
+     """
+     try:
+         datetime.datetime.strptime(item, "%Y-%m-%d")
+     except ValueError:
+         return False
+
+     return True
+
+ def valid_reverse_iso_date(item):
+     """
+     Check if a date item parses as reverse ISO (DD-MM-YYYY)
+     """
+     try:
+         datetime.datetime.strptime(item, "%d-%m-%Y")
+     except ValueError:
+         return False
+
+     return True
+
+ def valid_uk_date(item):
+     """
+     Check if a date item parses as UK format (DD/MM/YYYY)
+     """
+     try:
+         datetime.datetime.strptime(item, "%d/%m/%Y")
+     except ValueError:
+         return False
+
+     return True
+
+ def load_last_used_ids(last_used_ids_file, last_used_ids):
+     fh = open(last_used_ids_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter="\t")
+
+     for last_ids_data in csvr:
+         last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+     fh.close()
+     return last_used_ids
+
+ def load_saved_person_ids(person_file):
+     fh = open(person_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter="\t")
+     last_int = 1
+     person_ids = {}
+
+     next(csvr)
+     for persondata in csvr:
+         person_ids[persondata[0]] = persondata[1]
+         last_int += 1
+
+     fh.close()
+     return person_ids, last_int
+
+ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
+     fh = open(person_file, mode="r", encoding="utf-8-sig")
+     csvr = csv.reader(fh, delimiter=delim)
+     person_columns = {}
+     person_col_in_hdr_number = 0
+     reject_count = 0
+
+     personhdr = next(csvr)
+     print(personhdr)
+
+     # Make a dictionary of column names vs their positions
+     for col in personhdr:
+         person_columns[col] = person_col_in_hdr_number
+         person_col_in_hdr_number += 1
+
+     birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
+     print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+     person_col = person_columns[person_id_source]
+
+     for persondata in csvr:
+         if not valid_value(persondata[person_columns[person_id_source]]):
+             reject_count += 1
+             continue
+         if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+             reject_count += 1
+             continue
+         if persondata[person_col] not in person_ids:
+             if use_input_person_ids == "N":
+                 person_ids[persondata[person_col]] = str(person_number)
+                 person_number += 1
+             else:
+                 person_ids[persondata[person_col]] = str(persondata[person_col])
+     fh.close()
+
+     return person_ids, reject_count
+
+ @click.group(help="Commands for using python configurations to run the ETL transformation.")
+ def py():
+     pass
+
+ run.add_command(mapstream, "mapstream")
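The date helpers in `run.py` accept ISO (YYYY-MM-DD), reverse ISO (DD-MM-YYYY) and UK (DD/MM/YYYY) layouts. A minimal sketch of their observable behaviour, assuming the module is importable at the path shown in RECORD:

```python
from carrottransform.cli.subcommands.run import (
    parse_date, get_datetime_value, valid_date_value,
)

# parse_date normalises any of the three accepted layouts to YYYY-MM-DD
# (a string rearrangement only - it does not range-check the fields).
assert parse_date("2020-12-25") == "2020-12-25"   # ISO passes through
assert parse_date("25-12-2020") == "2020-12-25"   # reverse ISO
assert parse_date("25/12/2020") == "2020-12-25"   # UK style
assert parse_date("25.12.2020") is None           # unrecognised separator

# get_datetime_value returns a parsed datetime, or None on failure.
dt = get_datetime_value("25/12/2020")
assert (dt.year, dt.month, dt.day) == (2020, 12, 25)

# valid_date_value combines the blank check with the three format checks.
assert valid_date_value("2020-12-25") and not valid_date_value(" ")
```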