carrot-transform 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of carrot-transform might be problematic.

Files changed (33)
  1. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/METADATA +41 -18
  2. carrot_transform-0.4.0.dist-info/RECORD +41 -0
  3. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info}/WHEEL +1 -1
  4. carrot_transform-0.4.0.dist-info/entry_points.txt +2 -0
  5. carrottransform/__init__.py +1 -1
  6. carrottransform/_version.py +2 -2
  7. carrottransform/cli/command.py +9 -5
  8. carrottransform/cli/subcommands/run.py +302 -443
  9. carrottransform/cli/subcommands/run_v2.py +145 -0
  10. carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql +550 -0
  11. carrottransform/examples/test/rules/v1.json +280 -0
  12. carrottransform/examples/test/rules/v2.json +115 -0
  13. carrottransform/tools/__init__.py +4 -14
  14. carrottransform/tools/args.py +128 -0
  15. carrottransform/tools/click.py +21 -0
  16. carrottransform/tools/concept_helpers.py +61 -0
  17. carrottransform/tools/core.py +163 -0
  18. carrottransform/tools/date_helpers.py +79 -0
  19. carrottransform/tools/file_helpers.py +177 -7
  20. carrottransform/tools/logger.py +19 -0
  21. carrottransform/tools/mapping_types.py +32 -0
  22. carrottransform/tools/mappingrules.py +298 -32
  23. carrottransform/tools/metrics.py +274 -49
  24. carrottransform/tools/omopcdm.py +42 -32
  25. carrottransform/tools/orchestrator.py +381 -0
  26. carrottransform/tools/person_helpers.py +126 -0
  27. carrottransform/tools/record_builder.py +413 -0
  28. carrottransform/tools/stream_helpers.py +71 -0
  29. carrottransform/tools/types.py +71 -0
  30. carrottransform/tools/validation.py +62 -0
  31. carrot_transform-0.3.4.dist-info/RECORD +0 -24
  32. carrot_transform-0.3.4.dist-info/entry_points.txt +0 -3
  33. {carrot_transform-0.3.4.dist-info → carrot_transform-0.4.0.dist-info/licenses}/LICENSE +0 -0
carrottransform/tools/orchestrator.py
@@ -0,0 +1,381 @@
+import csv
+from pathlib import Path
+from typing import Dict, Tuple, Any, Optional, List, Set
+import carrottransform.tools as tools
+from carrottransform.tools.mappingrules import MappingRules
+from carrottransform.tools.omopcdm import OmopCDM
+from carrottransform.tools.logger import logger_setup
+from carrottransform.tools.person_helpers import (
+    load_person_ids,
+    set_saved_person_id_file,
+)
+from carrottransform.tools.date_helpers import normalise_to8601
+from carrottransform.tools.types import (
+    ProcessingResult,
+    ProcessingContext,
+    RecordContext,
+)
+from carrottransform.tools.record_builder import RecordBuilderFactory
+from carrottransform.tools.file_helpers import OutputFileManager
+from carrottransform.tools.stream_helpers import StreamingLookupCache
+
+logger = logger_setup()
+
+
+class StreamProcessor:
+    """Efficient single-pass streaming processor"""
+
+    def __init__(self, context: ProcessingContext, lookup_cache: StreamingLookupCache):
+        self.context = context
+        self.cache = lookup_cache
+
+    def process_all_data(self) -> ProcessingResult:
+        """Process all data with single-pass streaming approach"""
+        logger.info("Processing data...")
+
+        total_output_counts = {outfile: 0 for outfile in self.context.output_files}
+        total_rejected_counts = {infile: 0 for infile in self.context.input_files}
+
+        # Process each input file
+        for source_filename in self.context.input_files:
+            try:
+                output_counts, rejected_count = self._process_input_file_stream(
+                    source_filename
+                )
+
+                # Update totals
+                for target_file, count in output_counts.items():
+                    total_output_counts[target_file] += count
+                total_rejected_counts[source_filename] = rejected_count
+
+            except Exception as e:
+                logger.error(f"Error processing file {source_filename}: {str(e)}")
+                return ProcessingResult(
+                    total_output_counts,
+                    total_rejected_counts,
+                    success=False,
+                    error_message=str(e),
+                )
+
+        return ProcessingResult(total_output_counts, total_rejected_counts)
+
+    def _process_input_file_stream(
+        self, source_filename: str
+    ) -> Tuple[Dict[str, int], int]:
+        """Stream process a single input file with direct output writing"""
+        logger.info(f"Streaming input file: {source_filename}")
+
+        file_path = self.context.input_dir / source_filename
+        if not file_path.exists():
+            logger.warning(f"Input file not found: {source_filename}")
+            return {}, 0
+
+        # Get which output tables this input file can map to
+        applicable_targets = self.cache.input_to_outputs.get(source_filename, set())
+        if not applicable_targets:
+            logger.info(f"No mappings found for {source_filename}")
+            return {}, 0
+
+        output_counts = {target: 0 for target in applicable_targets}
+        rejected_count = 0
+
+        # Get file metadata from cache
+        file_meta = self.cache.file_metadata_cache[source_filename]
+        if not file_meta["datetime_source"] or not file_meta["person_id_source"]:
+            logger.warning(f"Missing date or person ID mapping for {source_filename}")
+            return output_counts, rejected_count
+
+        try:
+            with file_path.open(mode="r", encoding="utf-8-sig") as fh:
+                csv_reader = csv.reader(fh)
+                csv_column_headers = next(csv_reader)
+                input_column_map = self.context.omopcdm.get_column_map(
+                    csv_column_headers
+                )
+
+                # Validate required columns exist
+                datetime_col_idx = input_column_map.get(file_meta["datetime_source"])
+                if datetime_col_idx is None:
+                    logger.warning(
+                        f"Date field {file_meta['datetime_source']} not found in {source_filename}"
+                    )
+                    return output_counts, rejected_count
+
+                # Stream process each row
+                for input_data in csv_reader:
+                    row_counts, row_rejected = self._process_single_row_stream(
+                        source_filename,
+                        input_data,
+                        input_column_map,
+                        applicable_targets,
+                        datetime_col_idx,
+                        file_meta,
+                    )
+
+                    for target, count in row_counts.items():
+                        output_counts[target] += count
+                    rejected_count += row_rejected
+
+        except Exception as e:
+            logger.error(f"Error streaming file {source_filename}: {str(e)}")
+
+        return output_counts, rejected_count
+
+    def _process_single_row_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        applicable_targets: Set[str],
+        datetime_col_idx: int,
+        file_meta: Dict[str, Any],
+    ) -> Tuple[Dict[str, int], int]:
+        """Process single row and write directly to all applicable output files"""
+
+        # Increment input count
+        self.context.metrics.increment_key_count(
+            source=source_filename,
+            fieldname="all",
+            tablename="all",
+            concept_id="all",
+            additional="",
+            count_type="input_count",
+        )
+
+        # Normalize date once
+        fulldate = normalise_to8601(input_data[datetime_col_idx])
+        if fulldate is None:
+            self.context.metrics.increment_key_count(
+                source=source_filename,
+                fieldname="all",
+                tablename="all",
+                concept_id="all",
+                additional="",
+                count_type="input_date_fields",
+            )
+            return {}, 1
+
+        input_data[datetime_col_idx] = fulldate
+
+        row_output_counts = {}
+        total_rejected = 0
+
+        # Process this row for each applicable target table
+        for target_file in applicable_targets:
+            target_counts, target_rejected = self._process_row_for_target_stream(
+                source_filename, input_data, input_column_map, target_file, file_meta
+            )
+
+            row_output_counts[target_file] = target_counts
+            total_rejected += target_rejected
+
+        return row_output_counts, total_rejected
+
+    def _process_row_for_target_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        target_file: str,
+        file_meta: Dict[str, Any],
+    ) -> Tuple[int, int]:
+        """Process row for specific target and write records directly"""
+
+        v2_mapping = self.context.mappingrules.v2_mappings[target_file][source_filename]
+        target_column_map = self.context.target_column_maps[target_file]
+
+        # Get target metadata from cache
+        target_meta = self.cache.target_metadata_cache[target_file]
+        auto_num_col = target_meta["auto_num_col"]
+        person_id_col = target_meta["person_id_col"]
+        date_col_data = target_meta["date_col_data"]
+        date_component_data = target_meta["date_component_data"]
+        notnull_numeric_fields = target_meta["notnull_numeric_fields"]
+
+        data_columns = file_meta["data_fields"].get(target_file, [])
+
+        output_count = 0
+        rejected_count = 0
+
+        # Process each data column for this target
+        for data_column in data_columns:
+            if data_column not in input_column_map:
+                continue
+
+            column_output, column_rejected = self._process_data_column_stream(
+                source_filename,
+                input_data,
+                input_column_map,
+                target_file,
+                v2_mapping,
+                target_column_map,
+                data_column,
+                auto_num_col,
+                person_id_col,
+                date_col_data,
+                date_component_data,
+                notnull_numeric_fields,
+            )
+
+            output_count += column_output
+            rejected_count += column_rejected
+
+        return output_count, rejected_count
+
+    def _process_data_column_stream(
+        self,
+        source_filename: str,
+        input_data: List[str],
+        input_column_map: Dict[str, int],
+        target_file: str,
+        v2_mapping,
+        target_column_map: Dict[str, int],
+        data_column: str,
+        auto_num_col: Optional[str],
+        person_id_col: str,
+        date_col_data: Dict[str, str],
+        date_component_data: Dict[str, Dict[str, str]],
+        notnull_numeric_fields: List[str],
+    ) -> Tuple[int, int]:
+        """Process data column and write records directly to output"""
+
+        rejected_count = 0
+        # Create context for record building with direct write capability
+        context = RecordContext(
+            tgtfilename=target_file,
+            tgtcolmap=target_column_map,
+            v2_mapping=v2_mapping,
+            srcfield=data_column,
+            srcdata=input_data,
+            srccolmap=input_column_map,
+            srcfilename=source_filename,
+            omopcdm=self.context.omopcdm,
+            metrics=self.context.metrics,
+            # Additional context for direct writing
+            person_lookup=self.context.person_lookup,
+            record_numbers=self.context.record_numbers,
+            file_handles=self.context.file_handles,
+            auto_num_col=auto_num_col,
+            person_id_col=person_id_col,
+            date_col_data=date_col_data,
+            date_component_data=date_component_data,
+            notnull_numeric_fields=notnull_numeric_fields,
+        )
+
+        # Build records
+        builder = RecordBuilderFactory.create_builder(context)
+        result = builder.build_records()
+
+        # Update metrics
+        self.context.metrics = result.metrics
+
+        if not result.success:
+            rejected_count += 1
+
+        return result.record_count, rejected_count
+
+
+class V2ProcessingOrchestrator:
+    """Main orchestrator for the entire V2 processing pipeline"""
+
+    def __init__(
+        self,
+        rules_file: Path,
+        output_dir: Path,
+        input_dir: Path,
+        person_file: Path,
+        omop_ddl_file: Optional[Path],
+        omop_config_file: Optional[Path],
+        write_mode: str = "w",
+    ):
+        self.rules_file = rules_file
+        self.output_dir = output_dir
+        self.input_dir = input_dir
+        self.person_file = person_file
+        self.omop_ddl_file = omop_ddl_file
+        self.omop_config_file = omop_config_file
+        self.write_mode = write_mode
+
+        # Initialize components immediately
+        self.initialize_components()
+
+    def initialize_components(self):
+        """Initialize all processing components"""
+        self.omopcdm = OmopCDM(self.omop_ddl_file, self.omop_config_file)
+        self.mappingrules = MappingRules(self.rules_file, self.omopcdm)
+
+        if not self.mappingrules.is_v2_format:
+            raise ValueError("Rules file is not in v2 format!")
+
+        self.metrics = tools.metrics.Metrics(self.mappingrules.get_dataset_name())
+        self.output_manager = OutputFileManager(self.output_dir, self.omopcdm)
+
+        # Pre-compute lookup cache for efficient streaming
+        self.lookup_cache = StreamingLookupCache(self.mappingrules, self.omopcdm)
+
+    def setup_person_lookup(self) -> Tuple[Dict[str, str], int]:
+        """Setup person ID lookup and save mapping"""
+        saved_person_id_file = set_saved_person_id_file(None, self.output_dir)
+
+        person_lookup, rejected_person_count = load_person_ids(
+            saved_person_id_file,
+            self.person_file,
+            self.mappingrules,
+            use_input_person_ids="N",
+        )
+
+        # Save person IDs
+        with saved_person_id_file.open(mode="w") as fhpout:
+            fhpout.write("SOURCE_SUBJECT\tTARGET_SUBJECT\n")
+            for person_id, person_assigned_id in person_lookup.items():
+                fhpout.write(f"{str(person_id)}\t{str(person_assigned_id)}\n")
+
+        return person_lookup, rejected_person_count
+
+    def execute_processing(self) -> ProcessingResult:
+        """Execute the complete processing pipeline with efficient streaming"""
+
+        try:
+            # Setup person lookup
+            person_lookup, rejected_person_count = self.setup_person_lookup()
+
+            # Setup output files - keep all open for streaming
+            output_files = self.mappingrules.get_all_outfile_names()
+            file_handles, target_column_maps = self.output_manager.setup_output_files(
+                output_files, self.write_mode
+            )
+
+            # Create processing context
+            context = ProcessingContext(
+                mappingrules=self.mappingrules,
+                omopcdm=self.omopcdm,
+                input_dir=self.input_dir,
+                person_lookup=person_lookup,
+                record_numbers={output_file: 1 for output_file in output_files},
+                file_handles=file_handles,
+                target_column_maps=target_column_maps,
+                metrics=self.metrics,
+            )
+
+            # Process data using efficient streaming approach
+            processor = StreamProcessor(context, self.lookup_cache)
+            result = processor.process_all_data()
+
+            # Log results
+            logger.info(
+                f"person_id stats: total loaded {len(person_lookup)}, reject count {rejected_person_count}"
+            )
+            for target_file, count in result.output_counts.items():
+                logger.info(f"TARGET: {target_file}: output count {count}")
+
+            # Write summary
+            data_summary = self.metrics.get_mapstream_summary()
+            with (self.output_dir / "summary_mapstream.tsv").open(mode="w") as dsfh:
+                dsfh.write(data_summary)
+
+            return result
+
+        finally:
+            # Always close files
+            if self.output_manager:
+                self.output_manager.close_all_files()
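
For orientation, a minimal sketch of how the new V2ProcessingOrchestrator added above might be driven directly. The rules and DDL paths are files added in this release; the input, output, and person-file paths are placeholders, and calling the orchestrator directly is an assumption (the release also adds a run_v2.py CLI subcommand, which is presumably the intended entry point).

from pathlib import Path

from carrottransform.tools.orchestrator import V2ProcessingOrchestrator

# Sketch only: the rules and DDL files ship with this release; other paths are hypothetical.
orchestrator = V2ProcessingOrchestrator(
    rules_file=Path("carrottransform/examples/test/rules/v2.json"),
    output_dir=Path("output"),                   # hypothetical output directory
    input_dir=Path("input"),                     # hypothetical directory of source CSVs
    person_file=Path("input/demographics.csv"),  # hypothetical person-level CSV
    omop_ddl_file=Path("carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql"),
    omop_config_file=None,                       # assumed optional, per the Optional[Path] hint
    write_mode="w",
)

result = orchestrator.execute_processing()
if result.success:
    for target, count in result.output_counts.items():
        print(f"{target}: {count} records")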
carrottransform/tools/person_helpers.py
@@ -0,0 +1,126 @@
+import csv
+import sys
+from pathlib import Path
+from carrottransform.tools.logger import logger_setup
+from carrottransform.tools.validation import valid_value, valid_date_value
+from carrottransform.tools.mappingrules import MappingRules
+
+logger = logger_setup()
+
+
+def load_last_used_ids(last_used_ids_file: Path, last_used_ids):
+    fh = last_used_ids_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+
+    for last_ids_data in csvr:
+        last_used_ids[last_ids_data[0]] = int(last_ids_data[1]) + 1
+
+    fh.close()
+    return last_used_ids
+
+
+def load_person_ids(
+    saved_person_id_file,
+    person_file,
+    mappingrules: MappingRules,
+    use_input_person_ids,
+    delim=",",
+):
+    person_ids, person_number = _get_person_lookup(saved_person_id_file)
+
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter=delim)
+    person_columns = {}
+    person_col_in_hdr_number = 0
+    reject_count = 0
+    # Header row of the person file
+    personhdr = next(csvr)
+    # TODO: not sure if this is needed
+    logger.info("Headers in Person file: %s", personhdr)
+
+    # Make a dictionary of column names vs their positions
+    for col in personhdr:
+        person_columns[col] = person_col_in_hdr_number
+        person_col_in_hdr_number += 1
+
+    ## check the mapping rules for person to find where to get the person data) i.e., which column in the person file contains dob, sex
+    birth_datetime_source, person_id_source = mappingrules.get_person_source_field_info(
+        "person"
+    )
+
+    ## get the column index of the PersonID from the input file
+    person_col = person_columns[person_id_source]
+
+    for persondata in csvr:
+        if not valid_value(
+            persondata[person_columns[person_id_source]]
+        ): # just checking that the id is not an empty string
+            reject_count += 1
+            continue
+        if not valid_date_value(persondata[person_columns[birth_datetime_source]]):
+            reject_count += 1
+            continue
+        if (
+            persondata[person_col] not in person_ids
+        ): # if not already in person_ids dict, add it
+            if use_input_person_ids == "N":
+                person_ids[persondata[person_col]] = str(
+                    person_number
+                ) # create a new integer person_id
+                person_number += 1
+            else:
+                person_ids[persondata[person_col]] = str(
+                    persondata[person_col]
+                ) # use existing person_id
+    fh.close()
+
+    return person_ids, reject_count
+
+
+# TODO: understand the purpose of this function and simplify it
+def set_saved_person_id_file(
+    saved_person_id_file: Path | None, output_dir: Path
+) -> Path:
+    """check if there is a saved person id file set in options - if not, check if the file exists and remove it"""
+
+    if saved_person_id_file is None:
+        saved_person_id_file = output_dir / "person_ids.tsv"
+        if saved_person_id_file.is_dir():
+            logger.exception(
+                f"the detected saved_person_id_file {saved_person_id_file} is already a dir"
+            )
+            sys.exit(1)
+        if saved_person_id_file.exists():
+            saved_person_id_file.unlink()
+    else:
+        if saved_person_id_file.is_dir():
+            logger.exception(
+                f"the passed saved_person_id_file {saved_person_id_file} is already a dir"
+            )
+            sys.exit(1)
+    return saved_person_id_file
+
+
+def _get_person_lookup(saved_person_id_file: Path) -> tuple[dict[str, str], int]:
+    # Saved-person-file existence test, reload if found, return last used integer
+    if saved_person_id_file.is_file():
+        person_lookup, last_used_integer = _load_saved_person_ids(saved_person_id_file)
+    else:
+        person_lookup = {}
+        last_used_integer = 1
+    return person_lookup, last_used_integer
+
+
+def _load_saved_person_ids(person_file: Path):
+    fh = person_file.open(mode="r", encoding="utf-8-sig")
+    csvr = csv.reader(fh, delimiter="\t")
+    last_int = 1
+    person_ids = {}
+
+    next(csvr)
+    for persondata in csvr:
+        person_ids[persondata[0]] = persondata[1]
+        last_int += 1
+
+    fh.close()
+    return person_ids, last_int
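
These person helpers mirror what setup_person_lookup() does in the orchestrator above: resolve output/person_ids.tsv (removing any stale copy), then stream the person CSV and build a source-to-assigned ID lookup. A minimal sketch, assuming hypothetical input paths and that the OMOP config file may be omitted:

from pathlib import Path

from carrottransform.tools.mappingrules import MappingRules
from carrottransform.tools.omopcdm import OmopCDM
from carrottransform.tools.person_helpers import load_person_ids, set_saved_person_id_file

output_dir = Path("output")                  # hypothetical
person_csv = Path("input/demographics.csv")  # hypothetical person-level CSV

# Rules/DDL files ship with this release; config file assumed optional (Optional[Path]).
omopcdm = OmopCDM(Path("carrottransform/config/OMOPCDM_postgresql_5.4_ddl.sql"), None)
rules = MappingRules(Path("carrottransform/examples/test/rules/v2.json"), omopcdm)

# Resolves to output/person_ids.tsv and deletes any stale copy from a previous run.
saved_ids = set_saved_person_id_file(None, output_dir)

# "N" assigns fresh sequential integer IDs instead of reusing the source IDs.
person_lookup, rejected = load_person_ids(
    saved_ids, person_csv, rules, use_input_person_ids="N"
)
print(f"loaded {len(person_lookup)} person IDs, rejected {rejected}")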