carrot-transform 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,106 @@
+ Metadata-Version: 2.3
+ Name: carrot_transform
+ Version: 0.3.4
+ Summary:
+ Author: anwarfg
+ Author-email: 913028+anwarfg@users.noreply.github.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: click (>=8.1.7,<9.0.0)
+ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
+ Requires-Dist: pytest (>=8.3.4,<9.0.0)
+ Description-Content-Type: text/markdown
+
+ <p align="center">
+ <a href="https://carrot.ac.uk/" target="_blank">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
+ <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
+ </picture>
+ </a>
+ </p>
+
+ <p align="center">
+
+ <a href="https://github.com/Health-Informatics-UoN/carrot-transform/releases">
+ <img src="https://img.shields.io/github/v/release/Health-Informatics-UoN/carrot-transform" alt="Release">
+ </a>
+ <a href="https://opensource.org/license/mit">
+ <img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License">
+ </a>
+ </p>
+
+
+ <div align="center">
+ <strong>
+ <h2>Streamlined Data Transformation to OMOP</h2><br />
+ <a href="https://carrot.ac.uk/">Carrot Transform</a> automates data transformation processes and facilitates the standardisation of datasets to the OMOP vocabulary, simplifying the integration of diverse data sources.
+ <br />
+ </strong>
+ </div>
+
+ <p align="center">
+ <br />
+ <a href="https://carrot.ac.uk/transform" rel="dofollow"><strong>Explore the docs »</strong></a>
+ <br />
+ <br />
+
+ <a href="https://carrot.ac.uk/">Carrot Mapper</a> is a web app that lets the user take the metadata of a dataset (as output by [WhiteRabbit](https://github.com/OHDSI/WhiteRabbit)) and produce mapping rules to the OMOP standard, in JSON format. These rules can then be ingested by [Carrot Transform](https://carrot.ac.uk/transform/quickstart) to map the contents of the dataset to OMOP.
+
+ Carrot Transform transforms input data into tab-separated value (TSV) files of standard OMOP tables, with concepts mapped according to the provided rules (generated by Carrot Mapper).
+
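+ As a rough sketch, a run over the bundled example data might look like the following (the `carrot-transform` entry point is defined in this wheel's entry_points.txt, but the subcommand path and option names here are inferred from the `mapstream` parameters in `run.py` further down, so confirm them with `carrot-transform --help`):
+
+ ```bash
+ carrot-transform run mapstream \
+   --rules-file carrottransform/examples/test/rules/rules_14June2021.json \
+   --person-file carrottransform/examples/test/inputs/Demographics.csv \
+   --output-dir output \
+   --omop-version "5.3" \
+   --input-dir carrottransform/examples/test/inputs
+ ```
+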
+ ## Quick Start for Developers
+
+ To get the project up and running, please follow the [Quick Start Guide](https://carrot.ac.uk/transform/quickstart).
+
+ ## Release Procedure
+ To release a new version of `carrot-transform`, follow these steps:
+
+ ### 1. Prepare the repository
+ - First ensure that the repository is clean and all required changes have been merged.
+ - Pull the latest changes from `main` with `git pull origin main`.
+
+ ### 2. Create a release branch
+
+ - Now create a new feature branch named `release/v<NEW-VERSION>` (e.g. `release/v0.2.0`), as sketched below.
+
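+ A minimal sketch, assuming the new version will be 0.2.0:
+
+ ```bash
+ git checkout -b release/v0.2.0
+ ```
+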
+ ### 3. Update the version number
+ - Use poetry to bump the version. For example, for a minor version update invoke:
+ ```bash
+ poetry version minor
+ ```
+ - Commit and push the changes (to the release feature branch):
+ ```bash
+ NEW_VERSION=$(poetry version -s)
+ git add pyproject.toml
+ git commit -m "Bump version to $NEW_VERSION"
+ git push --set-upstream origin release/v$NEW_VERSION
+ ```
+
+ ### 4. Create pull request
+ - Open a pull request from `release/v$NEW_VERSION` to `main` and await approval.
+ ### 5. Merge and tag
+ - After approval, merge the feature branch to `main`.
+ - Check out `main`, pull updates, and create a tag corresponding to the new version number.
+ ```bash
+ git checkout main
+ git pull origin main
+ git tag -a "$NEW_VERSION" -m "Release $NEW_VERSION"
+ git push origin "$NEW_VERSION"
+ ```
+
+ ### 6. Create a release
+ - We must now link the tag to a release in the GitHub repository. To do this from the command line, first install the GitHub CLI (`gh`) and then invoke:
+ ```bash
+ gh release create "$NEW_VERSION" --title "$NEW_VERSION" --notes "Release $NEW_VERSION"
+ ```
+
+ - Alternatively, follow the instructions in the [GitHub documentation](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) to manually create a release.
+
+ ## License
+
+ This repository's source code is available under the [MIT license](LICENSE).
@@ -0,0 +1,24 @@
+ carrottransform/__init__.py,sha256=cQJKTCpG2qmKxDl-VtSWQ3_WFjyzg4u_8nZacWAHFcU,73
+ carrottransform/_version.py,sha256=bm7SM-_MN0gstlNsCDO6dAajKcjQD-NxI_xpvfRx0Ts,172
+ carrottransform/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ carrottransform/cli/command.py,sha256=xYTaJsVZyRYv0CzUwrh7ZPK8hhGyC3MDfvVYxHcXYSM,508
+ carrottransform/cli/subcommands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ carrottransform/cli/subcommands/run.py,sha256=r2XanTvy4QowPbziZ5lqs-Tm8CAzCquL7DRy4lTT9Ak,23977
+ carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql,sha256=fXrPfdL3IzU5ux55ogsQKjjd-c1KzdP_N2A_JjlY3gk,18084
+ carrottransform/config/omop.json,sha256=OT3jvfPjKhjsDnQcQw1OAEOHhQLoHXNxTj_MDwNbYqo,1934
+ carrottransform/examples/test/inputs/Covid19_test.csv,sha256=d5t7Lfhkwbfe3Uk2IBqB2ZT5o0h9QaeraC8E5-IMERo,67521
+ carrottransform/examples/test/inputs/Demographics.csv,sha256=_ukUTpD4g751sL_mSL3f26T_Edd2kvH-evwm54VfXJI,85237
+ carrottransform/examples/test/inputs/Symptoms.csv,sha256=5dvGv16PNJJO_lFc0reRmQbE3m7iWfWajl51JDsqg0M,78447
+ carrottransform/examples/test/inputs/covid19_antibody.csv,sha256=SPCpyqpTbVq9987jXZ8AS4FEkrchRMAIYhTQJjfpwfY,98927
+ carrottransform/examples/test/inputs/vaccine.csv,sha256=_gcM-SIymyt2Dkkr_zGmQI9keIdmDm-gDI_QvXXLFrY,44037
+ carrottransform/examples/test/rules/rules_14June2021.json,sha256=n2OYNFhbx-NLhmqjAad6RsfXjQFknZIgQ7a5uyJF0Co,13226
+ carrottransform/tools/__init__.py,sha256=b3JuCwgJVx0rqx5igB8hNNKO0ktlbQjHGHwy-vzpdo0,198
+ carrottransform/tools/file_helpers.py,sha256=xlODDAUpsx0H4sweGZ81ttjJjNQGn2spNUa1Fndotw8,316
+ carrottransform/tools/mappingrules.py,sha256=IiZx24G27Rag-YgV-4jDxprJea9Ce7SZUbjxMm0n49k,7040
+ carrottransform/tools/metrics.py,sha256=LOzm80-YIVM9mvgvQXRpyArl2nSfSTTW9DikqJ5M2Yg,5700
+ carrottransform/tools/omopcdm.py,sha256=MwS_MwwBrypwjbFLuxoE0xlddWIi0T3BEPgN9LPkGAs,8508
+ carrot_transform-0.3.4.dist-info/LICENSE,sha256=pqIiuuTs6Na-oFd10MMsZoZmdfhfUhHeOtQzgzSkcaw,1082
+ carrot_transform-0.3.4.dist-info/METADATA,sha256=mbB8-GgOH6EnJXDr2j46Q97R3ID4Dro9IbgAFcJVAXY,4219
+ carrot_transform-0.3.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+ carrot_transform-0.3.4.dist-info/entry_points.txt,sha256=z7qmjTl7C8shrYiPBy6yZo9RRZ31Jcvo6L8ntdqbs2E,74
+ carrot_transform-0.3.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.1
+ Generator: poetry-core 2.1.1
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -0,0 +1,3 @@
+ [console_scripts]
+ carrot-transform=carrottransform.cli.command:transform
+
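
The entry point above registers the `carrot-transform` console script against `carrottransform.cli.command:transform`, so once the wheel is installed the command should be callable directly. A quick check, assuming a local copy of the wheel:

```bash
pip install carrot_transform-0.3.4-py3-none-any.whl
carrot-transform --help
```
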
@@ -1,2 +1,6 @@
- # TODO - pick this up automatically when building
- __version__ = '0.3.2'
+ from importlib.metadata import version
+
+ try:
+     __version__ = version("carrot_transform")  # Defined in the pyproject.toml
+ except Exception:
+     __version__ = "unknown"
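
With `_version.py` now reading the version from the installed distribution's metadata, a one-liner to verify it after installation (a sketch, not part of the package):

```bash
python -c "import carrottransform; print(carrottransform.__version__)"
```
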
@@ -8,6 +8,9 @@ import json
  import importlib.resources
  import carrottransform
  import carrottransform.tools as tools
+ from carrottransform.tools.omopcdm import OmopCDM
+ from typing import Iterator, IO
+

  @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
  def run():
@@ -35,7 +38,7 @@ def run():
                help="File containing additional / override json config for omop outputs")
  @click.option("--omop-version",
                required=False,
-               help="Quoted string containing opmop version - eg '5.3'")
+               help="Quoted string containing omop version - eg '5.3'")
  @click.option("--saved-person-id-file",
                default=None,
                required=False,
@@ -66,28 +69,26 @@ def mapstream(rules_file, output_dir, write_mode,
      # - check for values in optional arguments
      # - read in configuration files
      # - check main directories for existence
-     # - handle saved persion ids
-     # - initialise metrics
-     if (omop_ddl_file == None) and (omop_config_file == None) and (omop_version != None):
-         omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
-         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
-         omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
-
-     if os.path.isdir(input_dir[0]) == False:
-         print("Not a directory, input dir {0}".format(input_dir[0]))
-         sys.exit(1)
-
-     if os.path.isdir(output_dir) == False:
-         print("Not a directory, output dir {0}".format(output_dir))
-         sys.exit(1)
-
-     if saved_person_id_file == None:
-         saved_person_id_file = output_dir + "/" + "person_ids.tsv"
-         if os.path.exists(saved_person_id_file):
-             os.remove(saved_person_id_file)
+     # - handle saved person ids
+     # - initialise metrics
+     print(rules_file, output_dir, write_mode,
+           person_file, omop_ddl_file, omop_config_file,
+           omop_version, saved_person_id_file, use_input_person_ids,
+           last_used_ids_file, log_file_threshold, input_dir)
+
+     ## set omop filenames
+     omop_config_file, omop_ddl_file = set_omop_filenames(omop_ddl_file, omop_config_file, omop_version)
+     ## check directories are valid
+     check_dir_isvalid(input_dir)
+     check_dir_isvalid(output_dir)
+
+     saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)

      starttime = time.time()
+     ## create an OmopCDM object, which contains attributes and methods for the omop data tables
      omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
+
+     ## mapping rules determine the output files, the input files and fields used from the source data, and the mappings to omop concepts
      mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
      metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
      nowtime = time.time()
@@ -95,36 +96,41 @@ def mapstream(rules_file, output_dir, write_mode,
      print("--------------------------------------------------------------------------------")
      print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
      output_files = mappingrules.get_all_outfile_names()
+
+     ## set record numbers
+     ## these keep track of the current record number in each output file, e.g., measurement_id, observation_id
      record_numbers = {}
      for output_file in output_files:
          record_numbers[output_file] = 1
+     if last_used_ids_file != None:
+         if os.path.isfile(last_used_ids_file):
+             record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)

      fhd = {}
      tgtcolmaps = {}

+
+
      try:
-         # Saved-person-file existence test, reload if found, return last used integer
-         if os.path.isfile(saved_person_id_file):
-             person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
-         else:
-             person_lookup = {}
-             last_used_integer = 1
-         if last_used_ids_file != None:
-             if os.path.isfile(last_used_ids_file):
-                 record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
-
-         person_lookup, rejected_person_count = load_person_ids(person_file, person_lookup, mappingrules, use_input_person_ids, last_used_integer)
-         fhpout = open(saved_person_id_file, mode="w")
-         fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
-         for person_id, person_assigned_id in person_lookup.items():
-             fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
-         fhpout.close()
-         # Initialise output files, output a header for each
+         ## get all person_ids from the person file, either renumbering them with an int or taking them as-is, and add them to a dict
+         person_lookup, rejected_person_count = load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids)
+         ## open the person_ids output file
+         with open(saved_person_id_file, mode="w") as fhpout:
+             ## write the header to the file
+             fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+             ## iterate through the ids and write them to the file
+             for person_id, person_assigned_id in person_lookup.items():
+                 fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
+
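+         ## the saved person_ids file ends up as two tab-separated columns, e.g. (illustrative values only):
+         ##   SOURCE_SUBJECT    TARGET_SUBJECT
+         ##   ABC123            1
+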
+         ## Initialise output files (adding them to a dict), output a header for each
+         ## these are deliberately not closed here
          for tgtfile in output_files:
              fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
              if write_mode == 'w':
                  outhdr = omopcdm.get_omop_column_list(tgtfile)
                  fhd[tgtfile].write("\t".join(outhdr) + "\n")
+             ## map all omop columns for each file into a dict of column name to index,
+             ## so tgtcolmaps is a dict of dicts
              tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

      except IOError as e:
@@ -133,43 +139,35 @@ def mapstream(rules_file, output_dir, write_mode,

      print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))

-     # Compare files found in the input_dir with those expected based on mapping rules
+     ## Compare files found in the input_dir with those expected based on mapping rules
      existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
      rules_input_files = mappingrules.get_all_infile_names()
-     # Log mismatches but continue
-     for infile in existing_input_files:
-         if infile not in rules_input_files:
-             msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
-             print(msg)
-     for infile in rules_input_files:
-         if infile not in existing_input_files:
-             msg = "ERROR: no data for mapped input file - {0}".format(infile)
-             print(msg)

-     # set up overall counts
+     ## Log mismatches but continue
+     check_files_in_rules_exist(rules_input_files, existing_input_files)
+
+     ## set up overall counts
      rejidcounts = {}
      rejdatecounts = {}
      print(rules_input_files)

-     # set up per-input counts
+     ## set up per-input counts
      for srcfilename in rules_input_files:
          rejidcounts[srcfilename] = 0
          rejdatecounts[srcfilename] = 0

-     # main processing loop, for each input file
+     ## main processing loop, for each input file
      for srcfilename in rules_input_files:
          outcounts = {}
          rejcounts = {}
          rcount = 0

-         try:
-             fh = open(input_dir[0] + "/" + srcfilename, mode="r", encoding="utf-8-sig")
-             csvr = csv.reader(fh)
-         except IOError as e:
-             print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
-             print("I/O error({0}): {1}".format(e.errno, e.strerror))
+         fh, csvr = open_file(input_dir[0], srcfilename)
+         if fh is None:
              continue

+
+         ## look up, for this input file, the target output files and the source-to-target rules
          tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
          infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
          for tgtfile in tgtfiles:
@@ -185,12 +183,13 @@ def mapstream(rules_file, output_dir, write_mode,
          datetime_col = inputcolmap[infile_datetime_source]
          print("--------------------------------------------------------------------------------")
          print("Processing input: {0}".format(srcfilename))
-
+
          # for each input record
          for indata in csvr:
              key = srcfilename + "~all~all~all~"
              metrics.increment_key_count(key, "input_count")
              rcount += 1
+             # if there is a date, parse it - read it as a string and convert to YYYY-MM-DD
              strdate = indata[datetime_col].split(" ")[0]
              fulldate = parse_date(strdate)
              if fulldate != None:
@@ -214,30 +213,15 @@ def mapstream(rules_file, output_dir, write_mode,
                  for outrecord in outrecords:
                      if auto_num_col != None:
                          outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
+                     ## most of the rest of this section is actually to do with metrics
                      record_numbers[tgtfile] += 1
                      if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
                          outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
                          outcounts[tgtfile] += 1
-                         key = srcfilename + "~all~all~all~"
-                         metrics.increment_key_count(key, "output_count")
-                         key = "all~all~" + tgtfile + "~all~"
-                         metrics.increment_key_count(key, "output_count")
-                         key = srcfilename + "~all~" + tgtfile + "~all~"
-                         metrics.increment_key_count(key, "output_count")
-                         if tgtfile == "person":
-                             key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] +"~"
-                             metrics.increment_key_count(key, "output_count")
-                             key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
-                             metrics.increment_key_count(key, "output_count")
-                         else:
-                             key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[2] + "~"
-                             metrics.increment_key_count(key, "output_count")
-                             key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
-                             metrics.increment_key_count(key, "output_count")
-                             key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
-                             metrics.increment_key_count(key, "output_count")
-                             key = "all~all~all~" + outrecord[2] + "~"
-                             metrics.increment_key_count(key, "output_count")
+
+                         increment_key_counts(srcfilename, metrics, tgtfile, datacol, outrecord)
+
+                         # write the line to the file
                          fhd[tgtfile].write("\t".join(outrecord) + "\n")
                  else:
                      key = srcfilename + "~all~" + tgtfile + "~all~"
@@ -266,7 +250,39 @@ def mapstream(rules_file, output_dir, write_mode,
      nowtime = time.time()
      print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))

- def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srccolmap, srcfilename, omopcdm, metrics):
+ def increment_key_counts(srcfilename: str, metrics: tools.metrics.Metrics, tgtfile: str, datacol: str, outrecord: list[str]) -> None:
+     key = srcfilename + "~all~all~all~"
+     metrics.increment_key_count(key, "output_count")
+
+     key = "all~all~" + tgtfile + "~all~"
+     metrics.increment_key_count(key, "output_count")
+
+     key = srcfilename + "~all~" + tgtfile + "~all~"
+     metrics.increment_key_count(key, "output_count")
+
+     if tgtfile == "person":
+         key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
+         metrics.increment_key_count(key, "output_count")
+
+         key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
+         metrics.increment_key_count(key, "output_count")
+     else:
+         key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
+         metrics.increment_key_count(key, "output_count")
+
+         key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
+         metrics.increment_key_count(key, "output_count")
+
+         key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
+         metrics.increment_key_count(key, "output_count")
+
+         key = "all~all~all~" + outrecord[2] + "~"
+         metrics.increment_key_count(key, "output_count")
+     return
+
+
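+ ## the keys built above share a "~"-delimited shape (inferred from usage in this file, not documented in the source):
+ ##   <source file>~<source field>~<target table>~<record field, e.g. a concept id>~<optional extra>
+ ## e.g. "Symptoms.csv~all~observation~all~" counts every record mapped from Symptoms.csv into the observation table
+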
+ def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
+         tuple[bool, list[str], tools.metrics.Metrics]:
      """
      build all target records for a given input field
      """
@@ -279,6 +295,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
      srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
      summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
      if valid_value(str(srcdata[srccolmap[srcfield]])):
+         ## check if either or both of the srckey and summarykey are in the rules
          srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
          dictkeys = []
          if srcfullkey in rulesmap:
@@ -291,6 +308,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
      for dictkey in dictkeys:
          for out_data_elem in rulesmap[dictkey]:
              valid_data_elem = True
+             ## create an empty list to store the data; populate not-null numeric elements with "0" instead of an empty string
              tgtarray = ['']*len(tgtcolmap)
              for req_integer in notnull_numeric_fields:
                  tgtarray[tgtcolmap[req_integer]] = "0"
@@ -302,6 +320,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
              else:
                  tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
              if output_col_data in date_component_data:
+                 ## parse the date and store it in the proper format
                  strdate = srcdata[srccolmap[infield]].split(" ")[0]
                  dt = get_datetime_value(strdate)
                  if dt != None:
@@ -453,7 +472,9 @@ def load_saved_person_ids(person_file):
      fh.close()
      return person_ids, last_int

- def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids, person_number=1, delim=","):
+ def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
+     person_ids, person_number = get_person_lookup(saved_person_id_file)
+
      fh = open(person_file, mode="r", encoding="utf-8-sig")
      csvr = csv.reader(fh, delimiter=delim)
      person_columns = {}
@@ -468,23 +489,25 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
          person_columns[col] = person_col_in_hdr_number
          person_col_in_hdr_number += 1

+     ## check the mapping rules for person to find where to get the person data, i.e., which columns in the person file contain dob and sex
      birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
      print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
+     ## get the column index of the PersonID from the input file
      person_col = person_columns[person_id_source]

      for persondata in csvr:
-         if not valid_value(persondata[person_columns[person_id_source]]):
+         if not valid_value(persondata[person_columns[person_id_source]]):  # just check that the id is not an empty string
              reject_count += 1
              continue
          if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
              reject_count += 1
              continue
-         if persondata[person_col] not in person_ids:
+         if persondata[person_col] not in person_ids:  # if not already in the person_ids dict, add it
              if use_input_person_ids == "N":
-                 person_ids[persondata[person_col]] = str(person_number)
+                 person_ids[persondata[person_col]] = str(person_number)  # create a new integer person_id
                  person_number += 1
              else:
-                 person_ids[persondata[person_col]] = str(persondata[person_col])
+                 person_ids[persondata[person_col]] = str(persondata[person_col])  # use the existing person_id
      fh.close()

      return person_ids, reject_count
@@ -493,4 +516,62 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
  def py():
      pass

+ def check_dir_isvalid(directory: str | tuple[str, ...]) -> None:
+     ## check the given dir is valid; the input dir arrives as a tuple, so take its first element
+     if type(directory) is tuple:
+         directory = directory[0]
+
+     if not os.path.isdir(directory):
+         print("Not a directory, dir {0}".format(directory))
+         sys.exit(1)
+
+ def set_saved_person_id_file(saved_person_id_file: str, output_dir: str) -> str:
+     ## if no saved person id file was given in the options, use a default path and remove any stale copy
+     if saved_person_id_file is None:
+         saved_person_id_file = output_dir + "/" + "person_ids.tsv"
+         if os.path.exists(saved_person_id_file):
+             os.remove(saved_person_id_file)
+     return saved_person_id_file
+
+
+ def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
+     for infile in existing_input_files:
+         if infile not in rules_input_files:
+             msg = "WARNING: no mapping rules found for existing input file - {0}".format(infile)
+             print(msg)
+     for infile in rules_input_files:
+         if infile not in existing_input_files:
+             msg = "WARNING: no data for mapped input file - {0}".format(infile)
+             print(msg)
+
+ def open_file(directory: str, filename: str) -> tuple[IO[str] | None, Iterator[list[str]] | None]:
+     try:
+         fh = open(directory + "/" + filename, mode="r", encoding="utf-8-sig")
+         csvr = csv.reader(fh)
+         return fh, csvr
+     except IOError as e:
+         print("Unable to open: {0}".format(directory + "/" + filename))
+         print("I/O error({0}): {1}".format(e.errno, e.strerror))
+         ## return a pair so the caller's "fh, csvr = open_file(...)" unpacking still works when the open fails
+         return None, None
+
+ def set_omop_filenames(omop_ddl_file: str, omop_config_file: str, omop_version: str) -> tuple[str, str]:
+     if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
+         omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
+         omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
+         omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
+     return omop_config_file, omop_ddl_file
+
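+ ## e.g. omop_version "5.3" resolves the ddl file to carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql, which ships in this wheel (see the RECORD above)
+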
+ def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
+     # Saved-person-file existence test; reload if found, return the last used integer
+     if os.path.isfile(saved_person_id_file):
+         person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
+     else:
+         person_lookup = {}
+         last_used_integer = 1
+     return person_lookup, last_used_integer
+
  run.add_command(mapstream,"mapstream")
+
+ if __name__ == '__main__':
+     mapstream()