carrot-transform 0.3.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic. Click here for more details.

Files changed (47) hide show
  1. carrot_transform-0.4.0/.gitignore +14 -0
  2. carrot_transform-0.3.4/README.md → carrot_transform-0.4.0/PKG-INFO +45 -4
  3. carrot_transform-0.3.4/PKG-INFO → carrot_transform-0.4.0/README.md +32 -20
  4. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/__init__.py +1 -1
  5. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/_version.py +2 -2
  6. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/cli/command.py +9 -5
  7. carrot_transform-0.4.0/carrottransform/cli/subcommands/run.py +436 -0
  8. carrot_transform-0.4.0/carrottransform/cli/subcommands/run_v2.py +145 -0
  9. carrot_transform-0.4.0/carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  10. carrot_transform-0.4.0/carrottransform/examples/test/rules/v1.json +280 -0
  11. carrot_transform-0.4.0/carrottransform/examples/test/rules/v2.json +115 -0
  12. carrot_transform-0.4.0/carrottransform/tools/__init__.py +7 -0
  13. carrot_transform-0.4.0/carrottransform/tools/args.py +128 -0
  14. carrot_transform-0.4.0/carrottransform/tools/click.py +21 -0
  15. carrot_transform-0.4.0/carrottransform/tools/concept_helpers.py +61 -0
  16. carrot_transform-0.4.0/carrottransform/tools/core.py +163 -0
  17. carrot_transform-0.4.0/carrottransform/tools/date_helpers.py +79 -0
  18. carrot_transform-0.4.0/carrottransform/tools/file_helpers.py +185 -0
  19. carrot_transform-0.4.0/carrottransform/tools/logger.py +19 -0
  20. carrot_transform-0.4.0/carrottransform/tools/mapping_types.py +32 -0
  21. carrot_transform-0.4.0/carrottransform/tools/mappingrules.py +427 -0
  22. carrot_transform-0.4.0/carrottransform/tools/metrics.py +354 -0
  23. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/tools/omopcdm.py +42 -32
  24. carrot_transform-0.4.0/carrottransform/tools/orchestrator.py +381 -0
  25. carrot_transform-0.4.0/carrottransform/tools/person_helpers.py +126 -0
  26. carrot_transform-0.4.0/carrottransform/tools/record_builder.py +413 -0
  27. carrot_transform-0.4.0/carrottransform/tools/stream_helpers.py +71 -0
  28. carrot_transform-0.4.0/carrottransform/tools/types.py +71 -0
  29. carrot_transform-0.4.0/carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.4.0/pyproject.toml +55 -0
  31. carrot_transform-0.3.4/carrottransform/cli/subcommands/run.py +0 -577
  32. carrot_transform-0.3.4/carrottransform/tools/__init__.py +0 -17
  33. carrot_transform-0.3.4/carrottransform/tools/file_helpers.py +0 -15
  34. carrot_transform-0.3.4/carrottransform/tools/mappingrules.py +0 -161
  35. carrot_transform-0.3.4/carrottransform/tools/metrics.py +0 -129
  36. carrot_transform-0.3.4/pyproject.toml +0 -23
  37. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/LICENSE +0 -0
  38. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/cli/__init__.py +0 -0
  39. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/cli/subcommands/__init__.py +0 -0
  40. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
  41. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/config/omop.json +0 -0
  42. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/inputs/Covid19_test.csv +0 -0
  43. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/inputs/Demographics.csv +0 -0
  44. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/inputs/Symptoms.csv +0 -0
  45. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/inputs/covid19_antibody.csv +0 -0
  46. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/inputs/vaccine.csv +0 -0
  47. {carrot_transform-0.3.4 → carrot_transform-0.4.0}/carrottransform/examples/test/rules/rules_14June2021.json +0 -0
@@ -0,0 +1,14 @@
1
+ data
2
+ .ipynb_checkpoints
3
+ *_pycache_*
4
+ *.whl
5
+ *.spec
6
+ *egg-info
7
+ dist
8
+ .DS_store
9
+ build
10
+ *.env
11
+ temp
12
+ .python-version
13
+
14
+ *.orig
@@ -1,12 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: carrot_transform
3
+ Version: 0.4.0
4
+ Author-email: anwarfg <913028+anwarfg@users.noreply.github.com>
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: click<9,>=8.1.7
9
+ Requires-Dist: ruff>=0.12.0
10
+ Description-Content-Type: text/markdown
11
+
12
+
1
13
  <p align="center">
2
14
  <a href="https://carrot.ac.uk/" target="_blank">
3
15
  <picture>
4
- <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
5
- <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
16
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Health-Informatics-UoN/carrot-transform/refs/heads/main/images/logo-dark.png">
17
+ <img alt="Carrot Logo" src="https://raw.githubusercontent.com/Health-Informatics-UoN/carrot-transform/refs/heads/main/images/logo-primary.png" width="280"/>
6
18
  </picture>
7
19
  </a>
8
20
  </p>
9
21
 
22
+
10
23
  <p align="center">
11
24
 
12
25
  <a href="https://github.com/Health-Informatics-UoN/carrot-transform/releases">
@@ -36,11 +49,36 @@
36
49
 
37
50
  Carrot Transform transforms input data into tab separated variable files of standard OMOP tables, with concepts mapped according to the provided rules (generated from Carrot Mapper).
38
51
 
39
- ## Quick Start for Developers
52
+ ## Quick Start
40
53
 
41
54
  To have the project up and running, please follow the [Quick Start Guide](https://carrot.ac.uk/transform/quickstart).
42
55
 
56
+ If you need to perform development, [there's a brief guide here](https://carrot.ac.uk/transform/development) to get the tool up and running.
57
+
58
+ ## Formatting and Linting
59
+
60
+ This project is using [ruff](https://docs.astral.sh/ruff/) to check formatting and linting.
61
+ The only dependency is the [`uv` command line tool.](https://docs.astral.sh/uv/)
62
+ The `.vscode/tasks.json` file contains a task to run this tool for the currently open file.
63
+ The commands can be run on their own (in the root folder) like this ...
64
+
65
+ ```bash
66
+ # reformat all the files in `./`
67
+ λ uv run ruff format .
68
+
69
+ # run linting checks on all the files in `./`
70
+ λ uv run ruff check .
71
+
72
+ # check and fix all the files in `./`
73
+ λ uv run ruff check --fix .
74
+
75
+ # check and fix all the files in `./` but do so more aggressively
76
+ λ uv run ruff check --fix --unsafe-fixes .
77
+ ```
78
+
79
+
43
80
  ## Release Procedure
81
+
44
82
  To release a new version of `carrot-transform` follow these steps:
45
83
 
46
84
  ### 1. Prepare the repository
@@ -66,6 +104,7 @@ git push --set-upstream origin release/v$NEW_VERSION
66
104
 
67
105
  ### 4. Create pull request
68
106
  - Open a pull request from `release/v$NEW_VERSION` to `main` and await approval.
107
+
69
108
  ### 5. Merge and tag
70
109
  - After approval, merge the feature branch to `main`.
71
110
  - Checkout to `main`, pull updates, and create a tag corresponding to the new version number.
@@ -77,12 +116,14 @@ git push origin "$NEW_VERSION"
77
116
  ```
78
117
 
79
118
  ### 6. Create a release
119
+
80
120
  - We must now link the tag to a release in the GitHub repository. To do this from the command line first install GitHub command line tools `gh` and then invoke:
81
121
  ```bash
82
122
  gh release create "$TAG" --title "$TAG" --notes "Release for $VERSION"
83
123
  ```
84
124
 
85
125
  - Alternatively, follow the instructions in the [GitHub documentation](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) to manually create a release.
126
+
86
127
  ## License
87
128
 
88
- This repository's source code is available under the [MIT license](LICENSE).
129
+ This repository's source code is available under the [MIT license](LICENSE).
@@ -1,30 +1,14 @@
1
- Metadata-Version: 2.3
2
- Name: carrot_transform
3
- Version: 0.3.4
4
- Summary:
5
- Author: anwarfg
6
- Author-email: 913028+anwarfg@users.noreply.github.com
7
- Requires-Python: >=3.10,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.10
10
- Classifier: Programming Language :: Python :: 3.11
11
- Classifier: Programming Language :: Python :: 3.12
12
- Classifier: Programming Language :: Python :: 3.13
13
- Requires-Dist: click (>=8.1.7,<9.0.0)
14
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
15
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
16
- Requires-Dist: pytest (>=8.3.4,<9.0.0)
17
- Description-Content-Type: text/markdown
18
1
 
19
2
  <p align="center">
20
3
  <a href="https://carrot.ac.uk/" target="_blank">
21
4
  <picture>
22
- <source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
23
- <img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
5
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Health-Informatics-UoN/carrot-transform/refs/heads/main/images/logo-dark.png">
6
+ <img alt="Carrot Logo" src="https://raw.githubusercontent.com/Health-Informatics-UoN/carrot-transform/refs/heads/main/images/logo-primary.png" width="280"/>
24
7
  </picture>
25
8
  </a>
26
9
  </p>
27
10
 
11
+
28
12
  <p align="center">
29
13
 
30
14
  <a href="https://github.com/Health-Informatics-UoN/carrot-transform/releases">
@@ -54,11 +38,36 @@ Description-Content-Type: text/markdown
54
38
 
55
39
  Carrot Transform transforms input data into tab separated variable files of standard OMOP tables, with concepts mapped according to the provided rules (generated from Carrot Mapper).
56
40
 
57
- ## Quick Start for Developers
41
+ ## Quick Start
58
42
 
59
43
  To have the project up and running, please follow the [Quick Start Guide](https://carrot.ac.uk/transform/quickstart).
60
44
 
45
+ If you need to perform development, [there's a brief guide here](https://carrot.ac.uk/transform/development) to get the tool up and running.
46
+
47
+ ## Formatting and Linting
48
+
49
+ This project is using [ruff](https://docs.astral.sh/ruff/) to check formatting and linting.
50
+ The only dependency is the [`uv` command line tool.](https://docs.astral.sh/uv/)
51
+ The `.vscode/tasks.json` file contains a task to run this tool for the currently open file.
52
+ The commands can be run on their own (in the root folder) like this ...
53
+
54
+ ```bash
55
+ # reformat all the files in `./`
56
+ λ uv run ruff format .
57
+
58
+ # run linting checks on all the files in `./`
59
+ λ uv run ruff check .
60
+
61
+ # check and fix all the files in `./`
62
+ λ uv run ruff check --fix .
63
+
64
+ # check and fix all the files in `./` but do so more aggressively
65
+ λ uv run ruff check --fix --unsafe-fixes .
66
+ ```
67
+
68
+
61
69
  ## Release Procedure
70
+
62
71
  To release a new version of `carrot-transform` follow these steps:
63
72
 
64
73
  ### 1. Prepare the repository
@@ -84,6 +93,7 @@ git push --set-upstream origin release/v$NEW_VERSION
84
93
 
85
94
  ### 4. Create pull request
86
95
  - Open a pull request from `release/v$NEW_VERSION` to `main` and await approval.
96
+
87
97
  ### 5. Merge and tag
88
98
  - After approval, merge the feature branch to `main`.
89
99
  - Checkout to `main`, pull updates, and create a tag corresponding to the new version number.
@@ -95,12 +105,14 @@ git push origin "$NEW_VERSION"
95
105
  ```
96
106
 
97
107
  ### 6. Create a release
108
+
98
109
  - We must now link the tag to a release in the GitHub repository. To do this from the command line first install GitHub command line tools `gh` and then invoke:
99
110
  ```bash
100
111
  gh release create "$TAG" --title "$TAG" --notes "Release for $VERSION"
101
112
  ```
102
113
 
103
114
  - Alternatively, follow the instructions in the [GitHub documentation](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) to manually create a release.
115
+
104
116
  ## License
105
117
 
106
118
  This repository's source code is available under the [MIT license](LICENSE).
@@ -1,5 +1,5 @@
1
1
  from ._version import __version__
2
2
 
3
3
  params = {
4
- 'version':__version__,
4
+ "version": __version__,
5
5
  }
@@ -1,6 +1,6 @@
1
1
  from importlib.metadata import version
2
2
 
3
3
  try:
4
- __version__ = version("carrot_transform") # Defined in the pyproject.toml
4
+ __version__ = version("carrot_transform") # Defined in the pyproject.toml
5
5
  except Exception:
6
- __version__ = "unknown"
6
+ __version__ = "unknown"
@@ -1,21 +1,25 @@
1
1
  # Package entry point - sets up the "run" subcommand
2
2
  from .subcommands.run import run
3
+ from .subcommands.run_v2 import run_v2
3
4
 
4
5
  import carrottransform as c
5
6
  import click
6
7
 
8
+
7
9
  @click.group(invoke_without_command=True)
8
- @click.option("--version","-v",is_flag=True)
10
+ @click.option("--version", "-v", is_flag=True)
9
11
  @click.pass_context
10
- def transform(ctx,version):
11
- if ctx.invoked_subcommand == None :
12
+ def transform(ctx, version):
13
+ if ctx.invoked_subcommand is None:
12
14
  if version:
13
15
  click.echo(c.__version__)
14
16
  else:
15
- click.echo(ctx.get_help())
17
+ click.echo(ctx.get_help())
16
18
  return
17
19
 
20
+
18
21
  transform.add_command(run, "run")
22
+ transform.add_command(run_v2, "run_v2")
19
23
 
20
24
  if __name__ == "__main__":
21
- transform()
25
+ transform()
@@ -0,0 +1,436 @@
1
+ import sys
2
+ import time
3
+ from pathlib import Path
4
+ import click
5
+
6
+ import carrottransform.tools as tools
7
+ from carrottransform.tools.click import PathArgs
8
+ from carrottransform.tools.file_helpers import (
9
+ check_dir_isvalid,
10
+ check_files_in_rules_exist,
11
+ open_file,
12
+ resolve_paths,
13
+ set_omop_filenames,
14
+ )
15
+ from carrottransform.tools.logger import logger_setup
16
+ from carrottransform.tools.core import (
17
+ get_target_records,
18
+ )
19
+ from carrottransform.tools.date_helpers import normalise_to8601
20
+ from carrottransform.tools.person_helpers import (
21
+ load_last_used_ids,
22
+ load_person_ids,
23
+ set_saved_person_id_file,
24
+ )
25
+ from carrottransform.tools.args import person_rules_check, OnlyOnePersonInputAllowed
26
+
27
+ logger = logger_setup()
28
+
29
+
30
+ @click.command()
31
+ @click.option(
32
+ "--rules-file",
33
+ type=PathArgs,
34
+ required=True,
35
+ help="json file containing mapping rules",
36
+ )
37
+ @click.option(
38
+ "--output-dir",
39
+ type=PathArgs,
40
+ default=None,
41
+ required=True,
42
+ help="define the output directory for OMOP-format tsv files",
43
+ )
44
+ @click.option(
45
+ "--write-mode",
46
+ default="w",
47
+ type=click.Choice(["w", "a"]),
48
+ help="force write-mode on output files",
49
+ )
50
+ @click.option(
51
+ "--person-file",
52
+ type=PathArgs,
53
+ required=True,
54
+ help="File containing person_ids in the first column",
55
+ )
56
+ @click.option(
57
+ "--omop-ddl-file",
58
+ type=PathArgs,
59
+ required=False,
60
+ help="File containing OHDSI ddl statements for OMOP tables",
61
+ )
62
+ @click.option(
63
+ "--omop-config-file",
64
+ type=PathArgs,
65
+ required=False,
66
+ help="File containing additional / override json config for omop outputs",
67
+ )
68
+ @click.option(
69
+ "--omop-version",
70
+ required=False,
71
+ help="Quoted string containing omop version - eg '5.3'",
72
+ )
73
+ @click.option(
74
+ "--saved-person-id-file",
75
+ type=PathArgs,
76
+ default=None,
77
+ required=False,
78
+ help="Full path to person id file used to save person_id state and share person_ids between data sets",
79
+ )
80
+ @click.option(
81
+ "--use-input-person-ids",
82
+ required=False,
83
+ default="N",
84
+ help="Use person ids as input without generating new integers",
85
+ )
86
+ @click.option(
87
+ "--last-used-ids-file",
88
+ type=PathArgs,
89
+ default=None,
90
+ required=False,
91
+ help="Full path to last used ids file for OMOP tables - format: tablename\tlast_used_id, \nwhere last_used_id must be an integer",
92
+ )
93
+ @click.option(
94
+ "--log-file-threshold",
95
+ required=False,
96
+ default=0,
97
+ help="Lower outcount limit for logfile output",
98
+ )
99
+ @click.option("--input-dir", type=PathArgs, required=True, help="Input directories")
100
+ def mapstream(
101
+ rules_file: Path,
102
+ output_dir: Path,
103
+ write_mode,
104
+ person_file: Path,
105
+ omop_ddl_file: Path,
106
+ omop_config_file: Path,
107
+ omop_version,
108
+ saved_person_id_file: Path,
109
+ use_input_person_ids,
110
+ last_used_ids_file: Path,
111
+ log_file_threshold,
112
+ input_dir: Path,
113
+ ):
114
+ """
115
+ Map to output using input streams
116
+ """
117
+
118
+ # Resolve any @package paths in the arguments
119
+ [
120
+ rules_file,
121
+ output_dir,
122
+ person_file,
123
+ omop_ddl_file,
124
+ omop_config_file,
125
+ saved_person_id_file,
126
+ last_used_ids_file,
127
+ input_dir,
128
+ ] = resolve_paths(
129
+ [
130
+ rules_file,
131
+ output_dir,
132
+ person_file,
133
+ omop_ddl_file,
134
+ omop_config_file,
135
+ saved_person_id_file,
136
+ last_used_ids_file,
137
+ input_dir,
138
+ ]
139
+ )
140
+
141
+ # Initialisation
142
+ # - check for values in optional arguments
143
+ # - read in configuration files
144
+ # - check main directories for existence
145
+ # - handle saved person ids
146
+ # - initialise metrics
147
+ logger.info(
148
+ ",".join(
149
+ map(
150
+ str,
151
+ [
152
+ rules_file,
153
+ output_dir,
154
+ write_mode,
155
+ person_file,
156
+ omop_ddl_file,
157
+ omop_config_file,
158
+ omop_version,
159
+ saved_person_id_file,
160
+ use_input_person_ids,
161
+ last_used_ids_file,
162
+ log_file_threshold,
163
+ input_dir,
164
+ ],
165
+ )
166
+ )
167
+ )
168
+
169
+ # check on the rules file
170
+ if (rules_file is None) or (not rules_file.is_file()):
171
+ logger.exception(f"rules file was set to `{rules_file=}` and is missing")
172
+ sys.exit(-1)
173
+
174
+ ## set omop filenames
175
+ omop_config_file, omop_ddl_file = set_omop_filenames(
176
+ omop_ddl_file, omop_config_file, omop_version
177
+ )
178
+ ## check directories are valid
179
+ check_dir_isvalid(input_dir) # Input directory must exist - we need the files in it
180
+ check_dir_isvalid(
181
+ output_dir, create_if_missing=True
182
+ ) # Create output directory if needed
183
+
184
+ saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
185
+
186
+ ## check on the person_file_rules
187
+ try:
188
+ person_rules_check(rules_file=rules_file, person_file=person_file)
189
+ except OnlyOnePersonInputAllowed as e:
190
+ inputs = list(sorted(list(e._inputs)))
191
+
192
+ logger.error(
193
+ f"Person properties were mapped from ({inputs}) but can only come from the person file {person_file.name=}"
194
+ )
195
+ sys.exit(-1)
196
+ except Exception as e:
197
+ logger.exception(f"person_file_rules check failed: {e}")
198
+ sys.exit(-1)
199
+
200
+ start_time = time.time()
201
+ ## create OmopCDM object, which contains attributes and methods for the omop data tables.
202
+ omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
203
+
204
+ ## mapping rules determine the ouput files? which input files and fields in the source data, AND the mappings to omop concepts
205
+ mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
206
+ metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
207
+
208
+ logger.info(
209
+ "--------------------------------------------------------------------------------"
210
+ )
211
+ logger.info(
212
+ f"Loaded mapping rules from: {rules_file} in {time.time() - start_time:.5f} secs"
213
+ )
214
+
215
+ output_files = mappingrules.get_all_outfile_names()
216
+
217
+ ## set record number
218
+ ## will keep track of the current record number in each file, e.g., measurement_id, observation_id.
219
+ record_numbers = {}
220
+ for output_file in output_files:
221
+ record_numbers[output_file] = 1
222
+ if (last_used_ids_file is not None) and last_used_ids_file.is_file():
223
+ record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
224
+
225
+ fhd = {}
226
+ tgtcolmaps = {}
227
+
228
+ try:
229
+ ## get all person_ids from file and either renumber with an int or take directly, and add to a dict
230
+ person_lookup, rejected_person_count = load_person_ids(
231
+ saved_person_id_file, person_file, mappingrules, use_input_person_ids
232
+ )
233
+
234
+ ## open person_ids output file
235
+ with saved_person_id_file.open(mode="w") as fhpout:
236
+ ## write the header to the file
237
+ fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
238
+ ##iterate through the ids and write them to the file.
239
+ for person_id, person_assigned_id in person_lookup.items():
240
+ fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")
241
+
242
+ ## Initialise output files (adding them to a dict), output a header for each
243
+ ## these aren't being closed deliberately
244
+ for tgtfile in output_files:
245
+ fhd[tgtfile] = (
246
+ (output_dir / tgtfile).with_suffix(".tsv").open(mode=write_mode)
247
+ )
248
+ if write_mode == "w":
249
+ outhdr = omopcdm.get_omop_column_list(tgtfile)
250
+ fhd[tgtfile].write("\t".join(outhdr) + "\n")
251
+ ## maps all omop columns for each file into a dict containing the column name and the index
252
+ ## so tgtcolmaps is a dict of dicts.
253
+ tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
254
+
255
+ except IOError as e:
256
+ logger.exception(f"I/O - error({e.errno}): {e.strerror} -> {str(e)}")
257
+ exit()
258
+
259
+ logger.info(
260
+ f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
261
+ )
262
+
263
+ ## Compare files found in the input_dir with those expected based on mapping rules
264
+ existing_input_files = [f.name for f in input_dir.glob("*.csv")]
265
+ rules_input_files = mappingrules.get_all_infile_names()
266
+
267
+ ## Log mismatches but continue
268
+ check_files_in_rules_exist(rules_input_files, existing_input_files)
269
+
270
+ ## set up overall counts
271
+ rejidcounts = {}
272
+ rejdatecounts = {}
273
+ logger.info(rules_input_files)
274
+
275
+ ## set up per-input counts
276
+ for srcfilename in rules_input_files:
277
+ rejidcounts[srcfilename] = 0
278
+ rejdatecounts[srcfilename] = 0
279
+
280
+ ## main processing loop, for each input file
281
+ for srcfilename in rules_input_files:
282
+ rcount = 0
283
+
284
+ fhcsvr = open_file(input_dir / srcfilename)
285
+ if fhcsvr is None: # check if it's none before unpacking
286
+ raise Exception(f"Couldn't find file {srcfilename} in {input_dir}")
287
+ fh, csvr = fhcsvr # unpack now because we can't unpack none
288
+
289
+ ## create dict for input file, giving the data and output file
290
+ tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
291
+ infile_datetime_source, infile_person_id_source = (
292
+ mappingrules.get_infile_date_person_id(srcfilename)
293
+ )
294
+
295
+ outcounts = {}
296
+ rejcounts = {}
297
+ for tgtfile in tgtfiles:
298
+ outcounts[tgtfile] = 0
299
+ rejcounts[tgtfile] = 0
300
+
301
+ datacolsall = []
302
+ csv_column_headers = next(csvr)
303
+ dflist = mappingrules.get_infile_data_fields(srcfilename)
304
+ for colname in csv_column_headers:
305
+ datacolsall.append(colname)
306
+ inputcolmap = omopcdm.get_column_map(csv_column_headers)
307
+ pers_id_col = inputcolmap[infile_person_id_source]
308
+ datetime_col = inputcolmap[infile_datetime_source]
309
+
310
+ logger.info(
311
+ "--------------------------------------------------------------------------------"
312
+ )
313
+ logger.info(f"Processing input: {srcfilename}")
314
+
315
+ # for each input record
316
+ for indata in csvr:
317
+ metrics.increment_key_count(
318
+ source=srcfilename,
319
+ fieldname="all",
320
+ tablename="all",
321
+ concept_id="all",
322
+ additional="",
323
+ count_type="input_count",
324
+ )
325
+ rcount += 1
326
+
327
+ # if there is a date, parse it - read it is a string and convert to YYYY-MM-DD HH:MM:SS
328
+ fulldate = normalise_to8601(indata[datetime_col])
329
+ if fulldate is not None:
330
+ indata[datetime_col] = fulldate
331
+ else:
332
+ metrics.increment_key_count(
333
+ source=srcfilename,
334
+ fieldname="all",
335
+ tablename="all",
336
+ concept_id="all",
337
+ additional="",
338
+ count_type="input_date_fields",
339
+ )
340
+ continue
341
+
342
+ for tgtfile in tgtfiles:
343
+ tgtcolmap = tgtcolmaps[tgtfile]
344
+ auto_num_col = omopcdm.get_omop_auto_number_field(tgtfile)
345
+ pers_id_col = omopcdm.get_omop_person_id_field(tgtfile)
346
+
347
+ datacols = datacolsall
348
+ if tgtfile in dflist:
349
+ datacols = dflist[tgtfile]
350
+
351
+ for datacol in datacols:
352
+ built_records, outrecords, metrics = get_target_records(
353
+ tgtfile,
354
+ tgtcolmap,
355
+ src_to_tgt,
356
+ datacol,
357
+ indata,
358
+ inputcolmap,
359
+ srcfilename,
360
+ omopcdm,
361
+ metrics,
362
+ )
363
+
364
+ if built_records:
365
+ for outrecord in outrecords:
366
+ if auto_num_col is not None:
367
+ outrecord[tgtcolmap[auto_num_col]] = str(
368
+ record_numbers[tgtfile]
369
+ )
370
+ ### most of the rest of this section is actually to do with metrics
371
+ record_numbers[tgtfile] += 1
372
+
373
+ if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
374
+ outrecord[tgtcolmap[pers_id_col]] = person_lookup[
375
+ outrecord[tgtcolmap[pers_id_col]]
376
+ ]
377
+ outcounts[tgtfile] += 1
378
+
379
+ metrics.increment_with_datacol(
380
+ source_path=srcfilename,
381
+ target_file=tgtfile,
382
+ datacol=datacol,
383
+ out_record=outrecord,
384
+ )
385
+
386
+ # write the line to the file
387
+ fhd[tgtfile].write("\t".join(outrecord) + "\n")
388
+ else:
389
+ metrics.increment_key_count(
390
+ source=srcfilename,
391
+ fieldname="all",
392
+ tablename=tgtfile,
393
+ concept_id="all",
394
+ additional="",
395
+ count_type="invalid_person_ids",
396
+ )
397
+ rejidcounts[srcfilename] += 1
398
+
399
+ if tgtfile == "person":
400
+ break
401
+
402
+ fh.close()
403
+
404
+ logger.info(
405
+ f"INPUT file data : {srcfilename}: input count {str(rcount)}, time since start {time.time() - start_time:.5} secs"
406
+ )
407
+ for outtablename, count in outcounts.items():
408
+ logger.info(f"TARGET: {outtablename}: output count {str(count)}")
409
+ # END main processing loop
410
+
411
+ logger.info(
412
+ "--------------------------------------------------------------------------------"
413
+ )
414
+
415
+ data_summary = metrics.get_mapstream_summary()
416
+ try:
417
+ dsfh = (output_dir / "summary_mapstream.tsv").open(mode="w")
418
+ dsfh.write(data_summary)
419
+ dsfh.close()
420
+ except IOError as e:
421
+ logger.exception(f"I/O error({e.errno}): {e.strerror}")
422
+ logger.exception("Unable to write file")
423
+ raise e
424
+
425
+ # END mapstream
426
+ logger.info(f"Elapsed time = {time.time() - start_time:.5f} secs")
427
+
428
+
429
+ @click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
430
+ def run():
431
+ pass
432
+
433
+
434
+ run.add_command(mapstream, "mapstream")
435
+ if __name__ == "__main__":
436
+ run()