carrot-transform 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of carrot-transform has been flagged as potentially problematic.
Files changed (32)
  1. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +214 -526
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/concept_helpers.py +61 -0
  16. carrottransform/tools/core.py +163 -0
  17. carrottransform/tools/date_helpers.py +79 -0
  18. carrottransform/tools/file_helpers.py +153 -9
  19. carrottransform/tools/logger.py +19 -0
  20. carrottransform/tools/mapping_types.py +32 -0
  21. carrottransform/tools/mappingrules.py +297 -34
  22. carrottransform/tools/metrics.py +162 -109
  23. carrottransform/tools/omopcdm.py +37 -32
  24. carrottransform/tools/orchestrator.py +381 -0
  25. carrottransform/tools/person_helpers.py +126 -0
  26. carrottransform/tools/record_builder.py +413 -0
  27. carrottransform/tools/stream_helpers.py +71 -0
  28. carrottransform/tools/types.py +71 -0
  29. carrottransform/tools/validation.py +62 -0
  30. carrot_transform-0.3.5.dist-info/RECORD +0 -25
  31. carrot_transform-0.3.5.dist-info/entry_points.txt +0 -3
  32. {carrot_transform-0.3.5.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/core.py
@@ -0,0 +1,163 @@
+ import carrottransform.tools as tools
+ from carrottransform.tools.omopcdm import OmopCDM
+ from carrottransform.tools.logger import logger_setup
+ from carrottransform.tools.validation import valid_value
+ from carrottransform.tools.date_helpers import get_datetime_value
+
+ logger = logger_setup()
+
+
+ def get_target_records(
+     tgtfilename: str,
+     tgtcolmap: dict[str, int],
+     rulesmap: dict[str, list[dict[str, list[str]]]],
+     srcfield: str,
+     srcdata: list[str],
+     srccolmap: dict[str, int],
+     srcfilename: str,
+     omopcdm: OmopCDM,
+     metrics: tools.metrics.Metrics,
+ ) -> tuple[bool, list[list[str]], tools.metrics.Metrics]:
+     """
+     build all target records for a given input field
+     """
+     build_records = False
+     tgtrecords = []
+     # Get field definitions from OMOP CDM
+     date_col_data = omopcdm.get_omop_datetime_linked_fields(tgtfilename)
+     date_component_data = omopcdm.get_omop_date_field_components(tgtfilename)
+     notnull_numeric_fields = omopcdm.get_omop_notnull_numeric_fields(tgtfilename)
+
+     # Build keys to look up rules
+     srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"
+
+     # Check if source field has a value
+     if valid_value(str(srcdata[srccolmap[srcfield]])):
+         ## check if either or both of the srckey and srcfullkey are in the rules
+         srcfullkey = (
+             srcfilename
+             + "~"
+             + srcfield
+             + "~"
+             + str(srcdata[srccolmap[srcfield]])
+             + "~"
+             + tgtfilename
+         )
+
+         dictkeys = []
+         # Check if we have rules for either the full key or just the source field
+         if tgtfilename == "person":
+             build_records = True
+             dictkeys.append(srcfilename + "~person")
+         elif srcfullkey in rulesmap:
+             build_records = True
+             dictkeys.append(srcfullkey)
+         if srckey in rulesmap:
+             build_records = True
+             dictkeys.append(srckey)
+
+         if build_records:
+             # Process each matching rule
+             for dictkey in dictkeys:
+                 for out_data_elem in rulesmap[dictkey]:
+                     valid_data_elem = True
+                     ## create an empty list to store the data; populate numeric elements with "0" instead of an empty string
+                     tgtarray = [""] * len(tgtcolmap)
+                     # Initialize numeric fields to 0
+                     for req_integer in notnull_numeric_fields:
+                         tgtarray[tgtcolmap[req_integer]] = "0"
+
+                     # Process each field mapping
+                     for infield, outfield_list in out_data_elem.items():
+                         if tgtfilename == "person" and isinstance(outfield_list, dict):
+                             # Handle term mappings for person records
+                             input_value = srcdata[srccolmap[infield]]
+                             if str(input_value) in outfield_list:
+                                 for output_col_data in outfield_list[str(input_value)]:
+                                     if "~" in output_col_data:
+                                         # Handle mapped values (like gender codes)
+                                         outcol, term = output_col_data.split("~")
+                                         tgtarray[tgtcolmap[outcol]] = term
+                                     else:
+                                         # Direct field copy
+                                         tgtarray[tgtcolmap[output_col_data]] = srcdata[
+                                             srccolmap[infield]
+                                         ]
+                         else:
+                             # Handle direct field copies and non-person records
+                             for output_col_data in outfield_list:
+                                 if "~" in output_col_data:
+                                     # Handle mapped values (like gender codes)
+                                     outcol, term = output_col_data.split("~")
+                                     tgtarray[tgtcolmap[outcol]] = term
+                                 else:
+                                     # Direct field copy
+                                     tgtarray[tgtcolmap[output_col_data]] = srcdata[
+                                         srccolmap[infield]
+                                     ]
+
+                         # get the value. this is our ISO 8601 value that was previously normalised
+                         source_date = srcdata[srccolmap[infield]]
+
+                         # Special handling for date fields
+                         if output_col_data in date_component_data:
+                             # this side of the if/else seems to be for birthdates, which are split up into four fields
+
+                             # parse the date and store it in the old format ... as a way to branch
+                             # ... this check might be redundant. the datetime values should be ones that have already been normalised
+                             dt = get_datetime_value(source_date.split(" ")[0])
+                             if dt is None:
+                                 # if (as above) dt isn't going to be None then this branch shouldn't happen
+                                 # maybe birthdates can be None?
+
+                                 metrics.increment_key_count(
+                                     source=srcfilename,
+                                     fieldname=srcfield,
+                                     tablename=tgtfilename,
+                                     concept_id="all",
+                                     additional="",
+                                     count_type="invalid_date_fields",
+                                 )
+                                 valid_data_elem = False
+                             else:
+                                 year_field = date_component_data[output_col_data][
+                                     "year"
+                                 ]
+                                 month_field = date_component_data[output_col_data][
+                                     "month"
+                                 ]
+                                 day_field = date_component_data[output_col_data][
+                                     "day"
+                                 ]
+                                 tgtarray[tgtcolmap[year_field]] = str(dt.year)
+                                 tgtarray[tgtcolmap[month_field]] = str(dt.month)
+                                 tgtarray[tgtcolmap[day_field]] = str(dt.day)
+
+                                 tgtarray[tgtcolmap[output_col_data]] = source_date
+
+                         elif (
+                             output_col_data in date_col_data
+                         ):  # date_col_data maps each datetime field K to the date field where only the date part of srcdata[K] is copied; present for all dates
+                             # this fork of the if/else seems to be for non-birthdates, which are handled differently
+
+                             # copy the full value into the datetime field
+                             tgtarray[tgtcolmap[output_col_data]] = source_date
+
+                             # select the first 10 chars, which will be YYYY-MM-DD
+                             tgtarray[tgtcolmap[date_col_data[output_col_data]]] = (
+                                 source_date[:10]
+                             )
+
+                     if valid_data_elem:
+                         tgtrecords.append(tgtarray)
+     else:
+         metrics.increment_key_count(
+             source=srcfilename,
+             fieldname=srcfield,
+             tablename=tgtfilename,
+             concept_id="all",
+             additional="",
+             count_type="invalid_source_fields",
+         )
+
+     return build_records, tgtrecords, metrics
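
A note for readers of this hunk: the rules lookup works on plain tilde-joined strings built from the source file, source field, optionally the source value, and the target table. A minimal sketch of the three key shapes used above, with invented file and field names:

    # hypothetical example values, for illustration only
    srcfilename, srcfield, value, tgtfilename = "demo.csv", "sex", "F", "observation"

    srckey = f"{srcfilename}~{srcfield}~{tgtfilename}"              # "demo.csv~sex~observation"
    srcfullkey = f"{srcfilename}~{srcfield}~{value}~{tgtfilename}"  # "demo.csv~sex~F~observation"
    personkey = f"{srcfilename}~person"                             # person targets always use this key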
carrottransform/tools/date_helpers.py
@@ -0,0 +1,79 @@
+ import datetime
+ import re
+
+
+ def get_datetime_value(item: str) -> datetime.datetime | None:
+     """
+     Check if a date item is non-null and parses as ISO (YYYY-MM-DD), reverse-ISO (DD-MM-YYYY),
+     or UK format (DD/MM/YYYY).
+     Returns a datetime object if successful, None otherwise.
+     """
+     date_formats = [
+         "%Y-%m-%d",  # ISO format (YYYY-MM-DD)
+         "%d-%m-%Y",  # Reverse ISO format (DD-MM-YYYY)
+         "%d/%m/%Y",  # UK old-style format (DD/MM/YYYY)
+     ]
+
+     for date_format in date_formats:
+         try:
+             return datetime.datetime.strptime(item, date_format)
+         except ValueError:
+             continue
+
+     # If we get here, none of the formats worked
+     return None
+
+
+ def normalise_to8601(item: str) -> str:
+     """parses, normalises, and formats a date value using regexes
+
+     could use just one regex but that seems bad.
+     """
+
+     both = item.split(" ")
+
+     match = re.match(r"(?P<year>\d{4})[-/](?P<month>\d{2})[-/](?P<day>\d{2})", both[0])
+     if not match:
+         match = re.match(
+             r"(?P<day>\d{2})[-/](?P<month>\d{2})[-/](?P<year>\d{4})", both[0]
+         )
+
+     if not match:
+         raise Exception(f"invalid date format {item=}")
+
+     data = match.groupdict()
+     year, month, day = data["year"], data["month"], data["day"]
+     value = str(int(year)).zfill(4)
+     value += "-"
+     value += str(int(month)).zfill(2)
+     value += "-"
+     value += str(int(day)).zfill(2)
+     value += " "
+
+     if 2 == len(both):
+         match = re.match(
+             r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2})(\.\d{6})?)?", both[1]
+         )
+         if match:
+             data = match.groupdict()
+             hour, minute, second = data["hour"], data["minute"], data["second"]
+         else:
+             hour, minute, second = None, None, None
+
+         # concat the time_suffix
+         if hour is not None:
+             if minute is None:
+                 raise Exception(
+                     f"unrecognized format seems to have 'hours' but not 'minutes' {item=}"
+                 )
+
+             value += str(int(hour)).zfill(2)
+             value += ":"
+             value += str(int(minute)).zfill(2)
+             value += ":"
+             value += str(int(second if second is not None else "0")).zfill(2)
+
+     if ":" not in value:
+         value += "00:00:00"
+
+     return value
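
Going by the formats and regexes above, the two helpers should behave roughly as follows; this is a sketch of expected behaviour, not code from the package:

    from carrottransform.tools.date_helpers import get_datetime_value, normalise_to8601

    # get_datetime_value tries ISO, then DD-MM-YYYY, then DD/MM/YYYY
    assert get_datetime_value("2021-03-01").year == 2021
    assert get_datetime_value("01/03/2021").day == 1
    assert get_datetime_value("2021.03.01") is None  # unrecognised separator

    # normalise_to8601 always emits "YYYY-MM-DD HH:MM:SS"
    assert normalise_to8601("01/03/2021") == "2021-03-01 00:00:00"
    assert normalise_to8601("2021-03-01 12:30") == "2021-03-01 12:30:00"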
carrottransform/tools/file_helpers.py
@@ -1,11 +1,11 @@
+ import csv
  import json
  import logging
- import os
  import sys
- import json
  import importlib.resources as resources
- from typing import List, Optional
+ from typing import IO, Iterator, List, Optional, Dict, TextIO, Tuple, cast
  from pathlib import Path
+ from carrottransform.tools.omopcdm import OmopCDM

  logger = logging.getLogger(__name__)

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
  def load_json(f_in: Path):
      try:
          data = json.load(f_in.open())
-     except Exception as err:
+     except Exception:
          logger.exception("{0} not found. Or cannot parse as json".format(f_in))
          sys.exit()

@@ -26,16 +26,160 @@ def load_json(f_in: Path):
  def resolve_paths(args: List[Optional[Path]]) -> List[Optional[Path]]:
      """Resolve special path syntaxes in command line arguments."""
      try:
-         with resources.files('carrottransform').joinpath('__init__.py') as f:
-             package_path = f.parent
+         # Fix for Traversable parent issue - convert to Path first
+         package_files = resources.files("carrottransform")
+         package_path = Path(str(package_files)).resolve()
      except Exception:
          # Fallback for development environment
          import carrottransform
+
          package_path = Path(carrottransform.__file__).resolve().parent
-
+
      # Handle None values and replace @carrot with the actual package path
-     prefix = '@carrot'
+     prefix = "@carrot"
      return [
-         package_path / Path(str(arg).replace(prefix, '').lstrip('/')) if arg is not None and str(arg).startswith(prefix) else arg
+         (
+             package_path
+             / Path(str(arg).replace(prefix, "").replace("\\", "/").lstrip("/"))
+             if arg is not None and str(arg).startswith(prefix)
+             else arg
+         )
          for arg in args
      ]
+
+
+ def check_dir_isvalid(directory: Path, create_if_missing: bool = False) -> None:
+     """Check if directory is valid, optionally create it if missing.
+
+     Args:
+         directory: Directory path
+         create_if_missing: If True, create the directory if it doesn't exist
+     """
+     ## if not a directory, create it if requested (including parents; this option is for the output directory only)
+     if not directory.is_dir():
+         if create_if_missing:
+             try:
+                 ## deliberately not using the exist_ok option, as we want to know whether it was created or not to provide different logger messages.
+                 directory.mkdir(parents=True)
+                 logger.info(f"Created directory: {directory}")
+             except OSError as e:
+                 logger.warning(f"Failed to create directory {directory}: {e}")
+                 sys.exit(1)
+         else:
+             logger.warning(f"Not a directory: {directory}")
+             sys.exit(1)
+
+
+ def check_files_in_rules_exist(
+     rules_input_files: list[str], existing_input_files: list[str]
+ ) -> None:
+     for infile in existing_input_files:
+         if infile not in rules_input_files:
+             msg = (
+                 "WARNING: no mapping rules found for existing input file - {0}".format(
+                     infile
+                 )
+             )
+             logger.warning(msg)
+     for infile in rules_input_files:
+         if infile not in existing_input_files:
+             msg = "WARNING: no data for mapped input file - {0}".format(infile)
+             logger.warning(msg)
+
+
+ def open_file(file_path: Path) -> tuple[IO[str], Iterator[list[str]]] | None:
+     """opens a file and returns its handle together with a CSV reader over it"""
+     try:
+         fh = file_path.open(mode="r", encoding="utf-8-sig")
+         csvr = csv.reader(fh)
+         return fh, csvr
+     except IOError as e:
+         logger.exception("Unable to open: {0}".format(file_path))
+         logger.exception("I/O error({0}): {1}".format(e.errno, e.strerror))
+         return None
+
+
+ def set_omop_filenames(
+     omop_ddl_file: Optional[Path],
+     omop_config_file: Optional[Path],
+     omop_version: Optional[str],
+ ) -> tuple[Optional[Path], Optional[Path]]:
+     """
+     Set default OMOP file paths when not explicitly provided.
+
+     This function provides a convenience mechanism where users can specify just
+     an OMOP version instead of providing full paths to both DDL and config files.
+
+     Args:
+         omop_ddl_file: Path to OMOP DDL file (optional)
+         omop_config_file: Path to OMOP config file (optional)
+         omop_version: OMOP version string (e.g., "5.3", "5.4")
+
+     Returns:
+         Tuple of (config_file_path, ddl_file_path) - either provided or defaults
+
+     Example:
+         # User provides version but no files - defaults will be used
+         config, ddl = set_omop_filenames(None, None, "5.3")
+
+         # User provides custom files - they will be returned unchanged
+         config, ddl = set_omop_filenames(custom_ddl, custom_config, "5.3")
+     """
+     # Only set defaults if BOTH files are None AND version is provided
+     if omop_ddl_file is None and omop_config_file is None and omop_version is not None:
+         logger.info(f"Using default OMOP files for version {omop_version}")
+
+         # Set default config file - convert Traversable to Path
+         config_traversable = resources.files("carrottransform") / "config" / "omop.json"
+         omop_config_file = Path(str(config_traversable))
+
+         # Set version-specific DDL file - convert Traversable to Path
+         omop_ddl_file_name = f"OMOPCDM_postgresql_{omop_version}_ddl.sql"
+         ddl_traversable = (
+             resources.files("carrottransform") / "config" / omop_ddl_file_name
+         )
+         omop_ddl_file = Path(str(ddl_traversable))
+
+         # Validate that the default files exist (now safe since they're Path objects)
+         if not omop_config_file.is_file():
+             logger.warning(f"Default config file not found: {omop_config_file}")
+         if not omop_ddl_file.is_file():
+             logger.warning(f"Default DDL file not found: {omop_ddl_file}")
+
+     return omop_config_file, omop_ddl_file
+
+
+ class OutputFileManager:
+     """Manages output file creation and cleanup"""
+
+     def __init__(self, output_dir: Path, omopcdm: OmopCDM):
+         self.output_dir = output_dir
+         self.omopcdm = omopcdm
+         self.file_handles: Dict[str, TextIO] = {}
+
+     def setup_output_files(
+         self, output_files: List[str], write_mode: str
+     ) -> Tuple[Dict[str, TextIO], Dict[str, Dict[str, int]]]:
+         """Setup output files and return file handles and column maps"""
+         target_column_maps = {}
+
+         for target_file in output_files:
+             file_path = (self.output_dir / target_file).with_suffix(".tsv")
+             self.file_handles[target_file] = cast(
+                 TextIO, file_path.open(mode=write_mode, encoding="utf-8")
+             )
+             if write_mode == "w":
+                 output_header = self.omopcdm.get_omop_column_list(target_file)
+                 self.file_handles[target_file].write("\t".join(output_header) + "\n")
+
+             target_column_maps[target_file] = self.omopcdm.get_omop_column_map(
+                 target_file
+             )
+
+         return self.file_handles, target_column_maps
+
+     def close_all_files(self):
+         """Close all open file handles"""
+         for fh in self.file_handles.values():
+             fh.close()
+         self.file_handles.clear()
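
Two of the additions above are easiest to understand from the call side. A hedged sketch (the concrete paths below are illustrative, not guaranteed locations):

    from pathlib import Path
    from carrottransform.tools.file_helpers import resolve_paths, set_omop_filenames

    # "@carrot/..." arguments are rebased onto the installed package directory;
    # anything else (including None) passes through unchanged
    args = [Path("@carrot/config/omop.json"), Path("/tmp/rules.json"), None]
    resolved = resolve_paths(args)
    # resolved[0] -> <site-packages>/carrottransform/config/omop.json (location varies)
    # resolved[1] -> /tmp/rules.json, resolved[2] -> None

    # giving only a version selects the bundled config and DDL files
    config, ddl = set_omop_filenames(None, None, "5.4")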
carrottransform/tools/logger.py
@@ -0,0 +1,19 @@
+ import logging
+ from logging import Logger
+
+
+ def logger_setup() -> Logger:
+     logger = logging.getLogger(__name__)
+     if not logger.handlers:
+         logger.setLevel(logging.INFO)
+
+         console_handler = logging.StreamHandler()
+         console_handler.setLevel(logging.INFO)
+
+         formatter = logging.Formatter(
+             "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+         )
+         console_handler.setFormatter(formatter)
+
+         logger.addHandler(console_handler)
+     return logger
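
The `logger.handlers` guard makes this helper idempotent, so repeated calls don't attach duplicate handlers; that is why modules such as core.py above can simply call it at import time:

    from carrottransform.tools.logger import logger_setup

    logger = logger_setup()
    logger.info("handlers are only attached on the first call")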
carrottransform/tools/mapping_types.py
@@ -0,0 +1,32 @@
+ from typing import Dict, List, Optional
+ from dataclasses import dataclass
+
+
+ # To prevent circular imports, these types live in a separate file rather than in types.py
+ @dataclass
+ class PersonIdMapping:
+     source_field: str
+     dest_field: str
+
+
+ @dataclass
+ class DateMapping:
+     source_field: str
+     dest_fields: List[str]
+
+
+ @dataclass
+ class ConceptMapping:
+     source_field: str
+     value_mappings: Dict[
+         str, Dict[str, List[int]]
+     ]  # value -> dest_field -> concept_ids
+     original_value_fields: List[str]
+
+
+ @dataclass
+ class V2TableMapping:
+     source_table: str
+     person_id_mapping: Optional[PersonIdMapping]
+     date_mapping: Optional[DateMapping]
+     concept_mappings: Dict[str, ConceptMapping]  # source_field -> ConceptMapping
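
To make the nesting of V2TableMapping concrete, here is a hypothetical instance for a single source table; every file, field, and concept id below is invented for illustration (8532 is the OMOP concept commonly used for FEMALE, but treat it as an example):

    from carrottransform.tools.mapping_types import (
        ConceptMapping,
        DateMapping,
        PersonIdMapping,
        V2TableMapping,
    )

    mapping = V2TableMapping(
        source_table="demo.csv",
        person_id_mapping=PersonIdMapping(source_field="PersonID", dest_field="person_id"),
        date_mapping=DateMapping(
            source_field="visit_date",
            dest_fields=["observation_date", "observation_datetime"],
        ),
        concept_mappings={
            "sex": ConceptMapping(
                source_field="sex",
                # value -> dest_field -> concept_ids
                value_mappings={"F": {"gender_concept_id": [8532]}},
                original_value_fields=["value_source_value"],
            )
        },
    )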