carrot-transform 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of carrot-transform might be problematic. Click here for more details.
- carrot_transform-0.3.4/PKG-INFO +106 -0
- carrot_transform-0.3.4/README.md +88 -0
- carrot_transform-0.3.4/carrottransform/_version.py +6 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/cli/subcommands/run.py +164 -83
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/config/omop.json +6 -0
- carrot_transform-0.3.4/carrottransform/examples/test/inputs/Covid19_test.csv +801 -0
- carrot_transform-0.3.4/carrottransform/examples/test/inputs/Demographics.csv +1001 -0
- carrot_transform-0.3.4/carrottransform/examples/test/inputs/Symptoms.csv +801 -0
- carrot_transform-0.3.4/carrottransform/examples/test/inputs/covid19_antibody.csv +1001 -0
- carrot_transform-0.3.4/carrottransform/examples/test/inputs/vaccine.csv +501 -0
- carrot_transform-0.3.4/carrottransform/examples/test/rules/rules_14June2021.json +300 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/tools/mappingrules.py +8 -8
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/tools/omopcdm.py +9 -2
- carrot_transform-0.3.4/pyproject.toml +23 -0
- carrot_transform-0.3.2/.github/workflows/pypi.publish.yml +0 -55
- carrot_transform-0.3.2/.gitignore +0 -12
- carrot_transform-0.3.2/MANIFEST.in +0 -2
- carrot_transform-0.3.2/PKG-INFO +0 -28
- carrot_transform-0.3.2/README.md +0 -16
- carrot_transform-0.3.2/carrot_transform.egg-info/PKG-INFO +0 -28
- carrot_transform-0.3.2/carrot_transform.egg-info/SOURCES.txt +0 -25
- carrot_transform-0.3.2/carrot_transform.egg-info/dependency_links.txt +0 -1
- carrot_transform-0.3.2/carrot_transform.egg-info/entry_points.txt +0 -2
- carrot_transform-0.3.2/carrot_transform.egg-info/top_level.txt +0 -1
- carrot_transform-0.3.2/carrot_transform.py +0 -5
- carrot_transform-0.3.2/carrottransform/_version.py +0 -2
- carrot_transform-0.3.2/pyproject.toml +0 -20
- carrot_transform-0.3.2/setup.cfg +0 -4
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/LICENSE +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/__init__.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/cli/__init__.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/cli/command.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/cli/subcommands/__init__.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/config/OMOPCDM_postgresql_5.3_ddl.sql +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/tools/__init__.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/tools/file_helpers.py +0 -0
- {carrot_transform-0.3.2 → carrot_transform-0.3.4}/carrottransform/tools/metrics.py +0 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: carrot_transform
|
|
3
|
+
Version: 0.3.4
|
|
4
|
+
Summary:
|
|
5
|
+
Author: anwarfg
|
|
6
|
+
Author-email: 913028+anwarfg@users.noreply.github.com
|
|
7
|
+
Requires-Python: >=3.10,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
|
14
|
+
Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
|
|
15
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
16
|
+
Requires-Dist: pytest (>=8.3.4,<9.0.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<a href="https://carrot.ac.uk/" target="_blank">
|
|
21
|
+
<picture>
|
|
22
|
+
<source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
|
|
23
|
+
<img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
|
|
24
|
+
</picture>
|
|
25
|
+
</a>
|
|
26
|
+
</p>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
|
|
30
|
+
<a href="https://github.com/Health-Informatics-UoN/carrot-transform/releases">
|
|
31
|
+
<img src="https://img.shields.io/github/v/release/Health-Informatics-UoN/carrot-transform" alt="Release">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://opensource.org/license/mit">
|
|
34
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License">
|
|
35
|
+
</a>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
<div align="center">
|
|
40
|
+
<strong>
|
|
41
|
+
<h2>Streamlined Data Transformation to OMOP</h2><br />
|
|
42
|
+
<a href="https://carrot.ac.uk/">Carrot Transform</a> automates data transformation processes and facilitates the standardisation of datasets to the OMOP vocabulary, simplifying the integration of diverse data sources.
|
|
43
|
+
<br />
|
|
44
|
+
</strong>
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
<p align="center">
|
|
48
|
+
<br />
|
|
49
|
+
<a href="https://carrot.ac.uk/transform" rel="dofollow"><strong>Explore the docs »</strong></a>
|
|
50
|
+
<br />
|
|
51
|
+
<br />
|
|
52
|
+
|
|
53
|
+
<a href="https://carrot.ac.uk/">Carrot Mapper</a> is a webapp which allows the user to use the metadata (as output by [WhiteRabbit](https://github.com/OHDSI/WhiteRabbit)) from a dataset to produce mapping rules to the OMOP standard, in the JSON format. These can be ingested by [Carrot Transform](https://carrot.ac.uk/transform/quickstart) to perform the mapping of the contents of the dataset to OMOP.
|
|
54
|
+
|
|
55
|
+
Carrot Transform transforms input data into tab-separated values (TSV) files of standard OMOP tables, with concepts mapped according to the provided rules (generated from Carrot Mapper).
|
|
56
|
+
|
|
57
|
+
## Quick Start for Developers
|
|
58
|
+
|
|
59
|
+
To have the project up and running, please follow the [Quick Start Guide](https://carrot.ac.uk/transform/quickstart).
|
|
60
|
+
|
|
61
|
+
## Release Procedure
|
|
62
|
+
To release a new version of `carrot-transform` follow these steps:
|
|
63
|
+
|
|
64
|
+
### 1. Prepare the repository
|
|
65
|
+
- First ensure that the repository is clean and all required changes have been merged.
|
|
66
|
+
- Pull the latest changes from `main` with `git pull origin main`.
|
|
67
|
+
|
|
68
|
+
### 2. Create a release branch
|
|
69
|
+
|
|
70
|
+
- Now create a new feature branch named `release/v<NEW-VERSION>` (e.g. `release/v0.2.0`).
|
|
71
|
+
|
|
72
|
+
### 3. Update the version number
|
|
73
|
+
- Use poetry to bump the version. For example, for a minor version update invoke:
|
|
74
|
+
```bash
|
|
75
|
+
poetry version minor
|
|
76
|
+
```
|
|
77
|
+
- Commit and push the changes (to the release feature branch):
|
|
78
|
+
```bash
|
|
79
|
+
NEW_VERSION=$(poetry version -s)
|
|
80
|
+
git add pyproject.toml
|
|
81
|
+
git commit -m "Bump version to $NEW_VERSION"
|
|
82
|
+
git push --set-upstream origin release/v$NEW_VERSION
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 4. Create pull request
|
|
86
|
+
- Open a pull request from `release/v$NEW_VERSION` to `main` and await approval.
|
|
87
|
+
### 5. Merge and tag
|
|
88
|
+
- After approval, merge the feature branch into `main`.
|
|
89
|
+
- Check out `main`, pull updates, and create a tag corresponding to the new version number.
|
|
90
|
+
```bash
|
|
91
|
+
git checkout main
|
|
92
|
+
git pull origin main
|
|
93
|
+
git tag -a "$NEW_VERSION" -m "Release $NEW_VERSION"
|
|
94
|
+
git push origin "$NEW_VERSION"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 6. Create a release
|
|
98
|
+
- We must now link the tag to a release in the GitHub repository. To do this from the command line, first install the GitHub command-line tool `gh` and then invoke:
|
|
99
|
+
```bash
|
|
100
|
+
gh release create "$NEW_VERSION" --title "$NEW_VERSION" --notes "Release for $NEW_VERSION"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
- Alternatively, follow the instructions in the [GitHub documentation](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) to manually create a release.
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
This repository's source code is available under the [MIT license](LICENSE).
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<a href="https://carrot.ac.uk/" target="_blank">
|
|
3
|
+
<picture>
|
|
4
|
+
<source media="(prefers-color-scheme: dark)" srcset="/images/logo-dark.png">
|
|
5
|
+
<img alt="Carrot Logo" src="/images/logo-primary.png" width="280"/>
|
|
6
|
+
</picture>
|
|
7
|
+
</a>
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
|
|
12
|
+
<a href="https://github.com/Health-Informatics-UoN/carrot-transform/releases">
|
|
13
|
+
<img src="https://img.shields.io/github/v/release/Health-Informatics-UoN/carrot-transform" alt="Release">
|
|
14
|
+
</a>
|
|
15
|
+
<a href="https://opensource.org/license/mit">
|
|
16
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License">
|
|
17
|
+
</a>
|
|
18
|
+
</p>
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
<div align="center">
|
|
22
|
+
<strong>
|
|
23
|
+
<h2>Streamlined Data Transformation to OMOP</h2><br />
|
|
24
|
+
<a href="https://carrot.ac.uk/">Carrot Transform</a> automates data transformation processes and facilitates the standardisation of datasets to the OMOP vocabulary, simplifying the integration of diverse data sources.
|
|
25
|
+
<br />
|
|
26
|
+
</strong>
|
|
27
|
+
</div>
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<br />
|
|
31
|
+
<a href="https://carrot.ac.uk/transform" rel="dofollow"><strong>Explore the docs »</strong></a>
|
|
32
|
+
<br />
|
|
33
|
+
<br />
|
|
34
|
+
|
|
35
|
+
<a href="https://carrot.ac.uk/">Carrot Mapper</a> is a webapp which allows the user to use the metadata (as output by [WhiteRabbit](https://github.com/OHDSI/WhiteRabbit)) from a dataset to produce mapping rules to the OMOP standard, in the JSON format. These can be ingested by [Carrot Transform](https://carrot.ac.uk/transform/quickstart) to perform the mapping of the contents of the dataset to OMOP.
|
|
36
|
+
|
|
37
|
+
Carrot Transform transforms input data into tab-separated values (TSV) files of standard OMOP tables, with concepts mapped according to the provided rules (generated from Carrot Mapper).
|
|
38
|
+
|
|
39
|
+
## Quick Start for Developers
|
|
40
|
+
|
|
41
|
+
To have the project up and running, please follow the [Quick Start Guide](https://carrot.ac.uk/transform/quickstart).
|
|
42
|
+
|
|
43
|
+
## Release Procedure
|
|
44
|
+
To release a new version of `carrot-transform` follow these steps:
|
|
45
|
+
|
|
46
|
+
### 1. Prepare the repository
|
|
47
|
+
- First ensure that the repository is clean and all required changes have been merged.
|
|
48
|
+
- Pull the latest changes from `main` with `git pull origin main`.
|
|
49
|
+
|
|
50
|
+
### 2. Create a release branch
|
|
51
|
+
|
|
52
|
+
- Now create a new feature branch named `release/v<NEW-VERSION>` (e.g. `release/v0.2.0`).
|
|
53
|
+
|
|
54
|
+
### 3. Update the version number
|
|
55
|
+
- Use poetry to bump the version. For example, for a minor version update invoke:
|
|
56
|
+
```bash
|
|
57
|
+
poetry version minor
|
|
58
|
+
```
|
|
59
|
+
- Commit and push the changes (to the release feature branch):
|
|
60
|
+
```bash
|
|
61
|
+
NEW_VERSION=$(poetry version -s)
|
|
62
|
+
git add pyproject.toml
|
|
63
|
+
git commit -m "Bump version to $NEW_VERSION"
|
|
64
|
+
git push --set-upstream origin release/v$NEW_VERSION
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### 4. Create pull request
|
|
68
|
+
- Open a pull request from `release/v$NEW_VERSION` to `main` and await approval.
|
|
69
|
+
### 5. Merge and tag
|
|
70
|
+
- After approval, merge the feature branch into `main`.
|
|
71
|
+
- Check out `main`, pull updates, and create a tag corresponding to the new version number.
|
|
72
|
+
```bash
|
|
73
|
+
git checkout main
|
|
74
|
+
git pull origin main
|
|
75
|
+
git tag -a "$NEW_VERSION" -m "Release $NEW_VERSION"
|
|
76
|
+
git push origin "$NEW_VERSION"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 6. Create a release
|
|
80
|
+
- We must now link the tag to a release in the GitHub repository. To do this from the command line, first install the GitHub command-line tool `gh` and then invoke:
|
|
81
|
+
```bash
|
|
82
|
+
gh release create "$NEW_VERSION" --title "$NEW_VERSION" --notes "Release for $NEW_VERSION"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
- Alternatively, follow the instructions in the [GitHub documentation](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) to manually create a release.
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
This repository's source code is available under the [MIT license](LICENSE).
|
|
@@ -8,6 +8,9 @@ import json
|
|
|
8
8
|
import importlib.resources
|
|
9
9
|
import carrottransform
|
|
10
10
|
import carrottransform.tools as tools
|
|
11
|
+
from carrottransform.tools.omopcdm import OmopCDM
|
|
12
|
+
from typing import Iterator, IO
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
@click.group(help="Commands for mapping data to the OMOP CommonDataModel (CDM).")
|
|
13
16
|
def run():
|
|
@@ -35,7 +38,7 @@ def run():
|
|
|
35
38
|
help="File containing additional / override json config for omop outputs")
|
|
36
39
|
@click.option("--omop-version",
|
|
37
40
|
required=False,
|
|
38
|
-
help="Quoted string containing
|
|
41
|
+
help="Quoted string containing omop version - eg '5.3'")
|
|
39
42
|
@click.option("--saved-person-id-file",
|
|
40
43
|
default=None,
|
|
41
44
|
required=False,
|
|
@@ -66,28 +69,26 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
66
69
|
# - check for values in optional arguments
|
|
67
70
|
# - read in configuration files
|
|
68
71
|
# - check main directories for existence
|
|
69
|
-
# - handle saved
|
|
70
|
-
# - initialise metrics
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if saved_person_id_file == None:
|
|
85
|
-
saved_person_id_file = output_dir + "/" + "person_ids.tsv"
|
|
86
|
-
if os.path.exists(saved_person_id_file):
|
|
87
|
-
os.remove(saved_person_id_file)
|
|
72
|
+
# - handle saved person ids
|
|
73
|
+
# - initialise metrics
|
|
74
|
+
print(rules_file, output_dir, write_mode,
|
|
75
|
+
person_file, omop_ddl_file, omop_config_file,
|
|
76
|
+
omop_version, saved_person_id_file, use_input_person_ids,
|
|
77
|
+
last_used_ids_file, log_file_threshold, input_dir)
|
|
78
|
+
|
|
79
|
+
## set omop filenames
|
|
80
|
+
omop_config_file, omop_ddl_file = set_omop_filenames(omop_ddl_file, omop_config_file, omop_version)
|
|
81
|
+
## check directories are valid
|
|
82
|
+
check_dir_isvalid(input_dir)
|
|
83
|
+
check_dir_isvalid(output_dir)
|
|
84
|
+
|
|
85
|
+
saved_person_id_file = set_saved_person_id_file(saved_person_id_file, output_dir)
|
|
88
86
|
|
|
89
87
|
starttime = time.time()
|
|
88
|
+
## create OmopCDM object, which contains attributes and methods for the omop data tables.
|
|
90
89
|
omopcdm = tools.omopcdm.OmopCDM(omop_ddl_file, omop_config_file)
|
|
90
|
+
|
|
91
|
+
## mapping rules determine the output files, which input files and fields in the source data are used, AND the mappings to omop concepts
|
|
91
92
|
mappingrules = tools.mappingrules.MappingRules(rules_file, omopcdm)
|
|
92
93
|
metrics = tools.metrics.Metrics(mappingrules.get_dataset_name(), log_file_threshold)
|
|
93
94
|
nowtime = time.time()
|
|
@@ -95,36 +96,41 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
95
96
|
print("--------------------------------------------------------------------------------")
|
|
96
97
|
print("Loaded mapping rules from: {0} in {1:.5f} secs".format(rules_file, (nowtime - starttime)))
|
|
97
98
|
output_files = mappingrules.get_all_outfile_names()
|
|
99
|
+
|
|
100
|
+
## set record number
|
|
101
|
+
## will keep track of the current record number in each file, e.g., measurement_id, observation_id.
|
|
98
102
|
record_numbers = {}
|
|
99
103
|
for output_file in output_files:
|
|
100
104
|
record_numbers[output_file] = 1
|
|
105
|
+
if last_used_ids_file != None:
|
|
106
|
+
if os.path.isfile(last_used_ids_file):
|
|
107
|
+
record_numbers = load_last_used_ids(last_used_ids_file, record_numbers)
|
|
101
108
|
|
|
102
109
|
fhd = {}
|
|
103
110
|
tgtcolmaps = {}
|
|
104
111
|
|
|
112
|
+
|
|
113
|
+
|
|
105
114
|
try:
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
|
|
119
|
-
for person_id, person_assigned_id in person_lookup.items():
|
|
120
|
-
fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
|
|
121
|
-
fhpout.close()
|
|
122
|
-
# Initialise output files, output a header for each
|
|
115
|
+
## get all person_ids from file and either renumber with an int or take directly, and add to a dict
|
|
116
|
+
person_lookup, rejected_person_count = load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids)
|
|
117
|
+
## open person_ids output file
|
|
118
|
+
with open(saved_person_id_file, mode="w") as fhpout:
|
|
119
|
+
## write the header to the file
|
|
120
|
+
fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
|
|
121
|
+
##iterate through the ids and write them to the file.
|
|
122
|
+
for person_id, person_assigned_id in person_lookup.items():
|
|
123
|
+
fhpout.write("{0}\t{1}\n".format(str(person_id), str(person_assigned_id)))
|
|
124
|
+
|
|
125
|
+
## Initialise output files (adding them to a dict), output a header for each
|
|
126
|
+
## these aren't being closed deliberately
|
|
123
127
|
for tgtfile in output_files:
|
|
124
128
|
fhd[tgtfile] = open(output_dir + "/" + tgtfile + ".tsv", mode=write_mode)
|
|
125
129
|
if write_mode == 'w':
|
|
126
130
|
outhdr = omopcdm.get_omop_column_list(tgtfile)
|
|
127
131
|
fhd[tgtfile].write("\t".join(outhdr) + "\n")
|
|
132
|
+
## maps all omop columns for each file into a dict containing the column name and the index
|
|
133
|
+
## so tgtcolmaps is a dict of dicts.
|
|
128
134
|
tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)
|
|
129
135
|
|
|
130
136
|
except IOError as e:
|
|
@@ -133,43 +139,35 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
133
139
|
|
|
134
140
|
print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))
|
|
135
141
|
|
|
136
|
-
|
|
142
|
+
## Compare files found in the input_dir with those expected based on mapping rules
|
|
137
143
|
existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
|
|
138
144
|
rules_input_files = mappingrules.get_all_infile_names()
|
|
139
|
-
# Log mismatches but continue
|
|
140
|
-
for infile in existing_input_files:
|
|
141
|
-
if infile not in rules_input_files:
|
|
142
|
-
msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
|
|
143
|
-
print(msg)
|
|
144
|
-
for infile in rules_input_files:
|
|
145
|
-
if infile not in existing_input_files:
|
|
146
|
-
msg = "ERROR: no data for mapped input file - {0}".format(infile)
|
|
147
|
-
print(msg)
|
|
148
145
|
|
|
149
|
-
|
|
146
|
+
## Log mismatches but continue
|
|
147
|
+
check_files_in_rules_exist(rules_input_files, existing_input_files)
|
|
148
|
+
|
|
149
|
+
## set up overall counts
|
|
150
150
|
rejidcounts = {}
|
|
151
151
|
rejdatecounts = {}
|
|
152
152
|
print(rules_input_files)
|
|
153
153
|
|
|
154
|
-
|
|
154
|
+
## set up per-input counts
|
|
155
155
|
for srcfilename in rules_input_files:
|
|
156
156
|
rejidcounts[srcfilename] = 0
|
|
157
157
|
rejdatecounts[srcfilename] = 0
|
|
158
158
|
|
|
159
|
-
|
|
159
|
+
## main processing loop, for each input file
|
|
160
160
|
for srcfilename in rules_input_files:
|
|
161
161
|
outcounts = {}
|
|
162
162
|
rejcounts = {}
|
|
163
163
|
rcount = 0
|
|
164
164
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
csvr = csv.reader(fh)
|
|
168
|
-
except IOError as e:
|
|
169
|
-
print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
|
|
170
|
-
print("I/O error({0}): {1}".format(e.errno, e.strerror))
|
|
165
|
+
fh, csvr = open_file(input_dir[0], srcfilename)
|
|
166
|
+
if fh is None:
|
|
171
167
|
continue
|
|
172
168
|
|
|
169
|
+
|
|
170
|
+
## create dict for input file, giving the data and output file
|
|
173
171
|
tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
|
|
174
172
|
infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
|
|
175
173
|
for tgtfile in tgtfiles:
|
|
@@ -185,12 +183,13 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
185
183
|
datetime_col = inputcolmap[infile_datetime_source]
|
|
186
184
|
print("--------------------------------------------------------------------------------")
|
|
187
185
|
print("Processing input: {0}".format(srcfilename))
|
|
188
|
-
|
|
186
|
+
|
|
189
187
|
# for each input record
|
|
190
188
|
for indata in csvr:
|
|
191
189
|
key = srcfilename + "~all~all~all~"
|
|
192
190
|
metrics.increment_key_count(key, "input_count")
|
|
193
191
|
rcount += 1
|
|
192
|
+
# if there is a date, parse it - read it as a string and convert to YYYY-MM-DD
|
|
194
193
|
strdate = indata[datetime_col].split(" ")[0]
|
|
195
194
|
fulldate = parse_date(strdate)
|
|
196
195
|
if fulldate != None:
|
|
@@ -214,30 +213,15 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
214
213
|
for outrecord in outrecords:
|
|
215
214
|
if auto_num_col != None:
|
|
216
215
|
outrecord[tgtcolmap[auto_num_col]] = str(record_numbers[tgtfile])
|
|
216
|
+
### most of the rest of this section is actually to do with metrics
|
|
217
217
|
record_numbers[tgtfile] += 1
|
|
218
218
|
if (outrecord[tgtcolmap[pers_id_col]]) in person_lookup:
|
|
219
219
|
outrecord[tgtcolmap[pers_id_col]] = person_lookup[outrecord[tgtcolmap[pers_id_col]]]
|
|
220
220
|
outcounts[tgtfile] += 1
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
key = srcfilename + "~all~" + tgtfile + "~all~"
|
|
226
|
-
metrics.increment_key_count(key, "output_count")
|
|
227
|
-
if tgtfile == "person":
|
|
228
|
-
key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] +"~"
|
|
229
|
-
metrics.increment_key_count(key, "output_count")
|
|
230
|
-
key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
|
|
231
|
-
metrics.increment_key_count(key, "output_count")
|
|
232
|
-
else:
|
|
233
|
-
key = srcfilename + "~" + datacol +"~" + tgtfile + "~" + outrecord[2] + "~"
|
|
234
|
-
metrics.increment_key_count(key, "output_count")
|
|
235
|
-
key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
|
|
236
|
-
metrics.increment_key_count(key, "output_count")
|
|
237
|
-
key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
|
|
238
|
-
metrics.increment_key_count(key, "output_count")
|
|
239
|
-
key = "all~all~all~" + outrecord[2] + "~"
|
|
240
|
-
metrics.increment_key_count(key, "output_count")
|
|
221
|
+
|
|
222
|
+
increment_key_counts(srcfilename, metrics, tgtfile, datacol, outrecord)
|
|
223
|
+
|
|
224
|
+
# write the line to the file
|
|
241
225
|
fhd[tgtfile].write("\t".join(outrecord) + "\n")
|
|
242
226
|
else:
|
|
243
227
|
key = srcfilename + "~all~" + tgtfile + "~all~"
|
|
@@ -266,7 +250,39 @@ def mapstream(rules_file, output_dir, write_mode,
|
|
|
266
250
|
nowtime = time.time()
|
|
267
251
|
print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
|
|
268
252
|
|
|
269
|
-
def
|
|
253
|
+
def increment_key_counts(srcfilename: str, metrics: tools.metrics.Metrics, tgtfile: str, datacol: str, outrecord: list[str]) -> None:
|
|
254
|
+
key = srcfilename + "~all~all~all~"
|
|
255
|
+
metrics.increment_key_count(key, "output_count")
|
|
256
|
+
|
|
257
|
+
key = "all~all~" + tgtfile + "~all~"
|
|
258
|
+
metrics.increment_key_count(key, "output_count")
|
|
259
|
+
|
|
260
|
+
key = srcfilename + "~all~" + tgtfile + "~all~"
|
|
261
|
+
metrics.increment_key_count(key, "output_count")
|
|
262
|
+
|
|
263
|
+
if tgtfile == "person":
|
|
264
|
+
key = srcfilename + "~all~" + tgtfile + "~" + outrecord[1] + "~"
|
|
265
|
+
metrics.increment_key_count(key, "output_count")
|
|
266
|
+
|
|
267
|
+
key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[1] + "~" + outrecord[2]
|
|
268
|
+
metrics.increment_key_count(key, "output_count")
|
|
269
|
+
else:
|
|
270
|
+
key = srcfilename + "~" + datacol + "~" + tgtfile + "~" + outrecord[2] + "~"
|
|
271
|
+
metrics.increment_key_count(key, "output_count")
|
|
272
|
+
|
|
273
|
+
key = srcfilename + "~all~" + tgtfile + "~" + outrecord[2] + "~"
|
|
274
|
+
metrics.increment_key_count(key, "output_count")
|
|
275
|
+
|
|
276
|
+
key = "all~all~" + tgtfile + "~" + outrecord[2] + "~"
|
|
277
|
+
metrics.increment_key_count(key, "output_count")
|
|
278
|
+
|
|
279
|
+
key = "all~all~all~" + outrecord[2] + "~"
|
|
280
|
+
metrics.increment_key_count(key, "output_count")
|
|
281
|
+
return
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def get_target_records(tgtfilename: str, tgtcolmap: dict[str, dict[str, int]], rulesmap: dict[str, list[dict[str, list[str]]]], srcfield: str, srcdata: list[str], srccolmap: dict[str, int], srcfilename: str, omopcdm: OmopCDM, metrics: tools.metrics.Metrics) -> \
|
|
285
|
+
tuple[bool, list[str], tools.metrics.Metrics]:
|
|
270
286
|
"""
|
|
271
287
|
build all target records for a given input field
|
|
272
288
|
"""
|
|
@@ -279,6 +295,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
|
|
|
279
295
|
srckey = srcfilename + "~" + srcfield + "~" + tgtfilename
|
|
280
296
|
summarykey = srcfilename + "~" + srcfield + "~" + tgtfilename + "~all~"
|
|
281
297
|
if valid_value(str(srcdata[srccolmap[srcfield]])):
|
|
298
|
+
## check if either or both of the srckey and summarykey are in the rules
|
|
282
299
|
srcfullkey = srcfilename + "~" + srcfield + "~" + str(srcdata[srccolmap[srcfield]]) + "~" + tgtfilename
|
|
283
300
|
dictkeys = []
|
|
284
301
|
if srcfullkey in rulesmap:
|
|
@@ -291,6 +308,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
|
|
|
291
308
|
for dictkey in dictkeys:
|
|
292
309
|
for out_data_elem in rulesmap[dictkey]:
|
|
293
310
|
valid_data_elem = True
|
|
311
|
+
## create empty list to store the data. Populate numerical data elements with 0 instead of empty string.
|
|
294
312
|
tgtarray = ['']*len(tgtcolmap)
|
|
295
313
|
for req_integer in notnull_numeric_fields:
|
|
296
314
|
tgtarray[tgtcolmap[req_integer]] = "0"
|
|
@@ -302,6 +320,7 @@ def get_target_records(tgtfilename, tgtcolmap, rulesmap, srcfield, srcdata, srcc
|
|
|
302
320
|
else:
|
|
303
321
|
tgtarray[tgtcolmap[output_col_data]] = srcdata[srccolmap[infield]]
|
|
304
322
|
if output_col_data in date_component_data:
|
|
323
|
+
## parse the date and store it in the proper format
|
|
305
324
|
strdate = srcdata[srccolmap[infield]].split(" ")[0]
|
|
306
325
|
dt = get_datetime_value(strdate)
|
|
307
326
|
if dt != None:
|
|
@@ -453,7 +472,9 @@ def load_saved_person_ids(person_file):
|
|
|
453
472
|
fh.close()
|
|
454
473
|
return person_ids, last_int
|
|
455
474
|
|
|
456
|
-
def load_person_ids(
|
|
475
|
+
def load_person_ids(saved_person_id_file, person_file, mappingrules, use_input_person_ids, delim=","):
|
|
476
|
+
person_ids, person_number = get_person_lookup(saved_person_id_file)
|
|
477
|
+
|
|
457
478
|
fh = open(person_file, mode="r", encoding="utf-8-sig")
|
|
458
479
|
csvr = csv.reader(fh, delimiter=delim)
|
|
459
480
|
person_columns = {}
|
|
@@ -468,23 +489,25 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
|
|
|
468
489
|
person_columns[col] = person_col_in_hdr_number
|
|
469
490
|
person_col_in_hdr_number += 1
|
|
470
491
|
|
|
492
|
+
## check the mapping rules for person to find where to get the person data (i.e., which column in the person file contains dob, sex)
|
|
471
493
|
birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info("person")
|
|
472
494
|
print("Load Person Data {0}, {1}".format(birth_datetime_source, person_id_source))
|
|
495
|
+
## get the column index of the PersonID from the input file
|
|
473
496
|
person_col = person_columns[person_id_source]
|
|
474
497
|
|
|
475
498
|
for persondata in csvr:
|
|
476
|
-
if not valid_value(persondata[person_columns[person_id_source]]):
|
|
499
|
+
if not valid_value(persondata[person_columns[person_id_source]]): #just checking that the id is not an empty string
|
|
477
500
|
reject_count += 1
|
|
478
501
|
continue
|
|
479
502
|
if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
|
|
480
503
|
reject_count += 1
|
|
481
504
|
continue
|
|
482
|
-
if persondata[person_col] not in person_ids:
|
|
505
|
+
if persondata[person_col] not in person_ids: #if not already in person_ids dict, add it
|
|
483
506
|
if use_input_person_ids == "N":
|
|
484
|
-
person_ids[persondata[person_col]] = str(person_number)
|
|
507
|
+
person_ids[persondata[person_col]] = str(person_number) #create a new integer person_id
|
|
485
508
|
person_number += 1
|
|
486
509
|
else:
|
|
487
|
-
person_ids[persondata[person_col]] = str(persondata[person_col])
|
|
510
|
+
person_ids[persondata[person_col]] = str(persondata[person_col]) #use existing person_id
|
|
488
511
|
fh.close()
|
|
489
512
|
|
|
490
513
|
return person_ids, reject_count
|
|
@@ -493,4 +516,62 @@ def load_person_ids(person_file, person_ids, mappingrules, use_input_person_ids,
|
|
|
493
516
|
def py():
|
|
494
517
|
pass
|
|
495
518
|
|
|
519
|
+
def check_dir_isvalid(directory: str | tuple[str, ...]) -> None:
|
|
520
|
+
## check the directory is valid (if given a tuple, only its first element is checked); exits if it is not a directory
|
|
521
|
+
if type(directory) is tuple:
|
|
522
|
+
directory = directory[0]
|
|
523
|
+
|
|
524
|
+
if not os.path.isdir(directory):
|
|
525
|
+
print("Not a directory, dir {0}".format(directory))
|
|
526
|
+
sys.exit(1)
|
|
527
|
+
|
|
528
|
+
def set_saved_person_id_file(saved_person_id_file: str, output_dir: str) -> str:
|
|
529
|
+
## check if there is a saved person id file set in options - if not, check if the file exists and remove it
|
|
530
|
+
if saved_person_id_file is None:
|
|
531
|
+
saved_person_id_file = output_dir + "/" + "person_ids.tsv"
|
|
532
|
+
if os.path.exists(saved_person_id_file):
|
|
533
|
+
os.remove(saved_person_id_file)
|
|
534
|
+
return saved_person_id_file
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def check_files_in_rules_exist(rules_input_files: list[str], existing_input_files: list[str]) -> None:
|
|
538
|
+
for infile in existing_input_files:
|
|
539
|
+
if infile not in rules_input_files:
|
|
540
|
+
msg = "WARNING: no mapping rules found for existing input file - {0}".format(infile)
|
|
541
|
+
print(msg)
|
|
542
|
+
for infile in rules_input_files:
|
|
543
|
+
if infile not in existing_input_files:
|
|
544
|
+
msg = "WARNING: no data for mapped input file - {0}".format(infile)
|
|
545
|
+
print(msg)
|
|
546
|
+
|
|
547
|
+
def open_file(directory: str, filename: str) -> tuple[IO[str], Iterator[list[str]]] | None:
|
|
548
|
+
#def open_file(directory: str, filename: str):
|
|
549
|
+
try:
|
|
550
|
+
fh = open(directory + "/" + filename, mode="r", encoding="utf-8-sig")
|
|
551
|
+
csvr = csv.reader(fh)
|
|
552
|
+
return fh, csvr
|
|
553
|
+
except IOError as e:
|
|
554
|
+
print("Unable to open: {0}".format(directory + "/" + filename))
|
|
555
|
+
print("I/O error({0}): {1}".format(e.errno, e.strerror))
|
|
556
|
+
return None
|
|
557
|
+
|
|
558
|
+
def set_omop_filenames(omop_ddl_file: str, omop_config_file: str, omop_version: str) -> tuple[str, str]:
|
|
559
|
+
if (omop_ddl_file is None) and (omop_config_file is None) and (omop_version is not None):
|
|
560
|
+
omop_config_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/omop.json'
|
|
561
|
+
omop_ddl_file_name = "OMOPCDM_postgresql_" + omop_version + "_ddl.sql"
|
|
562
|
+
omop_ddl_file = str(importlib.resources.files('carrottransform')) + '/' + 'config/' + omop_ddl_file_name
|
|
563
|
+
return omop_config_file, omop_ddl_file
|
|
564
|
+
|
|
565
|
+
def get_person_lookup(saved_person_id_file: str) -> tuple[dict[str, str], int]:
|
|
566
|
+
# Saved-person-file existence test, reload if found, return last used integer
|
|
567
|
+
if os.path.isfile(saved_person_id_file):
|
|
568
|
+
person_lookup, last_used_integer = load_saved_person_ids(saved_person_id_file)
|
|
569
|
+
else:
|
|
570
|
+
person_lookup = {}
|
|
571
|
+
last_used_integer = 1
|
|
572
|
+
return person_lookup, last_used_integer
|
|
573
|
+
|
|
496
574
|
run.add_command(mapstream,"mapstream")
|
|
575
|
+
|
|
576
|
+
if __name__== '__main__':
|
|
577
|
+
mapstream()
|
|
@@ -26,6 +26,10 @@
|
|
|
26
26
|
"visit_occurrence": {
|
|
27
27
|
"visit_start_datetime": "visit_start_date",
|
|
28
28
|
"visit_end_datetime": "visit_end_date"
|
|
29
|
+
},
|
|
30
|
+
"device_exposure": {
|
|
31
|
+
"device_exposure_start_datetime": "device_exposure_start_date",
|
|
32
|
+
"device_exposure_end_datetime": "device_exposure_end_date"
|
|
29
33
|
}
|
|
30
34
|
},
|
|
31
35
|
"date_field_components": {
|
|
@@ -46,6 +50,7 @@
|
|
|
46
50
|
"person": "person_id",
|
|
47
51
|
"procedure_occurrence": "person_id",
|
|
48
52
|
"specimen": "person_id",
|
|
53
|
+
"device_exposure": "person_id",
|
|
49
54
|
"visit_occurrence": "person_id"
|
|
50
55
|
},
|
|
51
56
|
"auto_number_field": {
|
|
@@ -56,6 +61,7 @@
|
|
|
56
61
|
"observation": "observation_id",
|
|
57
62
|
"procedure_occurrence": "procedure_occurrence_id",
|
|
58
63
|
"specimen": "specimen_id",
|
|
64
|
+
"device_exposure": "device_exposure_id",
|
|
59
65
|
"visit_occurrence": "visit_occurrence_id"
|
|
60
66
|
}
|
|
61
67
|
}
|