pybutt 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pybutt/io/importer.py ADDED
@@ -0,0 +1,928 @@
1
+ import queue
2
+ import threading
3
+ import time
4
+ import uuid
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pyarrow.parquet as pq
11
+
12
+ from pybutt.core.base import SqlServerIOBase, rows_from_arrow
13
+ from pybutt.core.config import (
14
+ BATCH_SIZE_DEFAULT,
15
+ CCI_DEFAULT,
16
+ IMPORT_ENGINE_DEFAULT,
17
+ MEM_COOLDOWN_DEFAULT,
18
+ MEM_HEARTBEAT_DEFAULT,
19
+ MEM_MAX_WAIT_DEFAULT,
20
+ MEM_SLEEP_DEFAULT,
21
+ MEM_THRESHOLD_DEFAULT,
22
+ SCHEMA_DEFAULT,
23
+ TRANSACTION_MODE_DEFAULT,
24
+ SqlConfig,
25
+ TransactionMode,
26
+ coerce_transaction_mode,
27
+ quote_identifier,
28
+ validate_engine,
29
+ validate_identifier,
30
+ )
31
+ from pybutt.core.logobs import (
32
+ MemoryGate,
33
+ MemoryHeartbeat,
34
+ context,
35
+ get_logger,
36
+ log_failure_summary,
37
+ log_memory_budget,
38
+ mem_fields,
39
+ )
40
+ from pybutt.exceptions import (
41
+ BatchImportError,
42
+ RowGroupImportError,
43
+ SchemaMismatchError,
44
+ )
45
+ from pybutt.files import (
46
+ default_import_manifest_filename,
47
+ default_manifest_filename,
48
+ load_file_manifest,
49
+ load_manifest,
50
+ validate_manifest_entries,
51
+ write_manifest,
52
+ )
53
+
54
# Module-level logger for this file, obtained from the project's logging helper.
logger = get_logger("importer")

# Unique end-of-stream marker: the reader thread enqueues it when it is done
# and the consumers detect it with an identity (`is`) comparison.
_SENTINEL = object()
57
+
58
+
59
@dataclass(slots=True)
class _QueueItem:
    """Payload passed from the reader thread to the writer.

    One item carries either a whole row group or a single batch of a row
    group, already converted to row tuples.
    """

    # Row tuples ready to be handed to the database driver.
    rows: list[tuple[Any, ...]]
    # Row-group label formatted as "<1-based index>/<total row groups>".
    rg_label: str
    # Batch index within the row group; None when the item is a whole row group.
    batch_idx: int | None
    # Number of rows in ``rows``.
    row_count: int
67
+
68
+
69
+ class Importer(SqlServerIOBase):
70
def __init__(
    self,
    config: SqlConfig,
    table: str,
    input_path,
    manifest_filename: str | None,
    schema: str = SCHEMA_DEFAULT,
    worker_count=1,
    batch_size: int = BATCH_SIZE_DEFAULT,
    transaction_mode: TransactionMode = TRANSACTION_MODE_DEFAULT,
    engine=IMPORT_ENGINE_DEFAULT,
    temp_manifest_filename: str | None = None,
    create_cci: bool = CCI_DEFAULT,
    mem_heartbeat: float = MEM_HEARTBEAT_DEFAULT,
    mem_threshold: float = MEM_THRESHOLD_DEFAULT,
    mem_sleep: float = MEM_SLEEP_DEFAULT,
    mem_max_wait: float = MEM_MAX_WAIT_DEFAULT,
    mem_cooldown: float = MEM_COOLDOWN_DEFAULT,
):
    """Configure an import of parquet files into ``schema.table``.

    ``schema`` and ``table`` are passed through ``validate_identifier``,
    ``engine`` through ``validate_engine`` and ``transaction_mode`` through
    ``coerce_transaction_mode``.  When a manifest filename is falsy the
    matching project default for this schema/table is used instead.  The
    four ``mem_*`` gate settings are bundled into a ``MemoryGate``.
    """
    super().__init__(config)

    # Target identifiers (validated before anything else is derived).
    self.schema = validate_identifier(schema)
    self.table = validate_identifier(table)

    # File locations.
    self.input_path = Path(input_path)
    self.manifest_filename = manifest_filename or default_manifest_filename(
        self.schema, self.table
    )
    self.temp_manifest_filename = (
        temp_manifest_filename
        or default_import_manifest_filename(self.schema, self.table)
    )

    # Execution settings.
    self.worker_count = worker_count
    self.transaction_mode = coerce_transaction_mode(transaction_mode)
    validate_engine(engine)
    self.engine = engine
    self.batch_size = batch_size
    self.create_cci = create_cci

    # Memory observability / back-pressure.
    self.mem_heartbeat = mem_heartbeat
    self.mem_gate = MemoryGate(
        mem_threshold,
        mem_sleep,
        mem_max_wait,
        mem_cooldown,
    )
113
+
114
def load_manifest(self):
    """Load the manifest stored under ``input_path / manifest_filename``."""
    return load_manifest(self.input_path / self.manifest_filename)
117
+
118
def load_manifest_entries(self):
    """Load the file manifest and return its entries validated against ``input_path``."""
    manifest_path = self.input_path / self.manifest_filename
    loaded = load_file_manifest(manifest_path, operation="Importer")
    return validate_manifest_entries(loaded, self.input_path)
123
+
124
def _build_insert_sql(
    self, columns: list[str], target_table: str | None = None
) -> str:
    """Build a parameterised ``INSERT`` statement for *columns*.

    Column names are quoted with ``quote_identifier`` and every column gets
    one ``?`` placeholder.  The destination is *target_table* when given,
    otherwise ``self.full_table_name()``.
    """
    quoted_names = [quote_identifier(name) for name in columns]
    markers = ["?" for _ in columns]
    column_list = ", ".join(quoted_names)
    placeholders = ", ".join(markers)
    destination = target_table if target_table else self.full_table_name()
    return f"INSERT INTO {destination} ({column_list}) VALUES ({placeholders})"
131
+
132
def _rows_from_batch(self, batch):
    """Convert one Arrow batch to rows by delegating to ``rows_from_arrow``."""
    converted = rows_from_arrow(batch)
    return converted
134
+
135
+ def _validate_and_build_insert(
136
+ self, cur, columns, filename, target_table: str | None = None
137
+ ):
138
+ table_columns = self.get_table_columns(cur, target_table=target_table)
139
+ self.validate_schema(columns, table_columns, filename)
140
+ return self._build_insert_sql(columns, target_table=target_table)
141
+
142
+ def get_table_columns(self, cur, target_table: str | None = None):
143
+ target_table = target_table or self.full_table_name()
144
+ cur.execute(f"SELECT TOP 0 * FROM {target_table}")
145
+ return [column[0] for column in cur.description]
146
+
147
def validate_schema(self, parquet_columns, table_columns, filename):
    """Raise ``SchemaMismatchError`` unless both column sets are equal.

    The comparison is set based, so column order and duplicates are
    ignored.  The error message lists the names unique to each side.
    """
    parquet_names = set(parquet_columns)
    table_names = set(table_columns)

    if parquet_names == table_names:
        return

    missing_in_sql = parquet_names - table_names
    missing_in_parquet = table_names - parquet_names
    raise SchemaMismatchError(
        f"Schema mismatch in {filename}:\n"
        f" Columns in parquet but not SQL: {missing_in_sql}\n"
        f" Columns in SQL but not parquet: {missing_in_parquet}"
    )
160
+
161
def import_file(self, filename, target_table: str | None = None):
    """Import one parquet file and return ``True`` on success.

    The engine-specific implementation is picked from ``self.engine``.  In
    FILE transaction mode the whole file operation goes through
    ``self.retry``; in every other mode it is called once because retries
    happen at a finer granularity inside the implementation.  Failures are
    logged and re-raised.
    """
    filepath = self.input_path / filename
    start = time.time()
    target_table_name = target_table or self.full_table_name()

    logger.info(
        "Importing "
        + context(
            file=filename,
            table=target_table_name,
            engine=self.engine,
            batch_size=self.batch_size,
            transaction_mode=self.transaction_mode.value,
            **mem_fields(),
        )
    )

    def _run_engine():
        # Resolve the implementation at call time, exactly once per attempt.
        if self.engine == "duckdb":
            runner = self._import_file_with_duckdb
        elif self.engine == "mssql-python":
            runner = self._import_file_with_mssql
        else:
            runner = self._import_file_impl
        return runner(filepath, filename, start, target_table=target_table)

    try:
        if self.transaction_mode == TransactionMode.FILE:
            # For FILE mode, wrap entire operation in retry logic
            self.retry(_run_engine, context=f"Import file {filename}")
        else:
            # For BATCH, ROWGROUP, and ROW modes, retries happen at granular level
            _run_engine()
    except MemoryError:
        logger.error(
            "Out of memory - not retrying (fatal) " + context(file=filename)
        )
        raise
    except Exception as e:
        logger.error(
            "Failed importing "
            + context(file=filename)
            + f": {self.safe_error_message(e)}"
        )
        logger.debug("Traceback for failed import of %s", filename, exc_info=True)
        raise

    return True
224
+
225
def _import_file_impl(
    self, filepath, filename, start, target_table: str | None = None
):
    """Implementation of file import with transaction management.

    Uses a producer-consumer pattern: a reader thread pre-reads
    rowgroups/batches into a bounded queue while the caller thread
    pushes rows to SQL Server via ``cur.executemany``. This keeps
    the TDS pipe fed and reduces ASYNC_NETWORK_IO waits.

    ``filepath`` is the parquet file to open, ``filename`` is used for
    logging and error context, ``start`` is the ``time.time()`` value the
    elapsed-seconds log field is measured from, and ``target_table``
    overrides the destination table when given.
    """
    with self.connection_p() as c:
        with c.cursor() as cur:
            cur.fast_executemany = True
            parquet_file = pq.ParquetFile(filepath)
            columns = parquet_file.schema.names
            # Fails fast on a column mismatch before any reader thread starts.
            insert_sql = self._validate_and_build_insert(
                cur, columns, filename, target_table=target_table
            )

            total_rows = 0
            # Bounded queue: the reader can be at most two items ahead.
            buf: queue.Queue[_QueueItem | object] = queue.Queue(maxsize=2)
            # Set by this (consumer) thread to ask the reader to stop.
            cancel = threading.Event()

            reader = threading.Thread(
                target=self._parquet_reader_thread,
                args=(parquet_file, buf, filename, cancel),
                daemon=True,
                name=f"pyodbc-reader-{filename}",
            )
            reader.start()

            try:
                while True:
                    item = buf.get()
                    # The reader signals end-of-stream with the sentinel ...
                    if item is _SENTINEL:
                        break
                    # ... and forwards its own failure as an exception object.
                    if isinstance(item, Exception):
                        raise item
                    assert isinstance(item, _QueueItem)

                    if self.transaction_mode == TransactionMode.ROWGROUP:
                        rows_in_rg = self._import_rowgroup_with_retry(
                            c,
                            cur,
                            item.rows,
                            insert_sql,
                            filename,
                            rg=item.rg_label,
                        )
                        total_rows += rows_in_rg
                    elif self.transaction_mode == TransactionMode.BATCH:
                        rows_in_batch = self._import_batch_with_retry(
                            c,
                            cur,
                            item.rows,
                            insert_sql,
                            filename,
                            rg=item.rg_label,
                            batch=item.batch_idx,
                            offset=total_rows,
                        )
                        total_rows += rows_in_batch
                    else:
                        # FILE / ROW modes — no per-item commit
                        cur.executemany(insert_sql, item.rows)
                        total_rows += item.row_count
            except BaseException:
                # Any failure (including KeyboardInterrupt) tells the reader to stop.
                cancel.set()
                raise
            finally:
                # Bounded wait: a reader that cannot finish promptly must not
                # hang the import; it is a daemon thread.
                reader.join(timeout=5)

            # Commit after entire file if in FILE mode
            if self.transaction_mode == TransactionMode.FILE:
                c.commit()

            logger.info(
                "Completed "
                + context(
                    file=filename,
                    rows=total_rows,
                    seconds=f"{time.time() - start:.2f}",
                    **mem_fields(),
                )
            )
310
+
311
def _load_parquet_with_duckdb(self, filepath):
    """Read an entire parquet file through DuckDB and return it as an Arrow table.

    Single quotes in the POSIX form of the path are doubled before the path
    is embedded in the ``read_parquet`` call.
    """
    path_text = str(filepath)
    logger.debug(
        "Loading parquet via DuckDB " + context(file=path_text, **mem_fields())
    )
    with self.connection_d() as dconn:
        escaped_path = str(filepath.as_posix()).replace("'", "''")
        query = f"SELECT * FROM read_parquet('{escaped_path}')"
        relation = dconn.execute(query)
        table = relation.fetch_arrow_table()
        logger.debug(
            "Loaded parquet via DuckDB "
            + context(
                file=str(filepath),
                rows=table.num_rows,
                **mem_fields(),
            )
        )
        return table
329
+
330
def _import_file_with_duckdb(
    self, filepath, filename, start, target_table: str | None = None
):
    """Import one parquet file using the "duckdb" engine path.

    In ROWGROUP mode the file is opened with ``pq.ParquetFile`` and read one
    row group at a time (DuckDB is not used on that path).  In every other
    mode the whole file is loaded through ``_load_parquet_with_duckdb`` and
    then split into batches of at most ``self.batch_size`` rows.  Rows are
    written with ``cur.executemany``.

    ``filename`` is used for logging and error context, ``start`` is the
    ``time.time()`` value the elapsed-seconds log field is measured from,
    and ``target_table`` overrides the destination table when given.
    """
    with self.connection_p() as c:
        with c.cursor() as cur:
            cur.fast_executemany = True
            if self.transaction_mode == TransactionMode.ROWGROUP:
                # Only metadata is read here; row groups are loaded lazily below.
                parquet_file = pq.ParquetFile(filepath)
                columns = parquet_file.schema.names
            else:
                # Materialises the complete file as one Arrow table.
                parquet_table = self._load_parquet_with_duckdb(filepath)
                columns = parquet_table.schema.names

            insert_sql = self._validate_and_build_insert(
                cur, columns, filename, target_table=target_table
            )

            total_rows = 0

            if self.transaction_mode == TransactionMode.ROWGROUP:
                for rg_idx in range(parquet_file.num_row_groups):
                    # Memory gate is consulted before each row group is read.
                    self.mem_gate.check(
                        f"read_row_group file={filename}"
                        f" rg={rg_idx + 1}/{parquet_file.num_row_groups}"
                    )
                    logger.debug(
                        "Reading row group "
                        + context(
                            file=filename,
                            rg=f"{rg_idx + 1}/{parquet_file.num_row_groups}",
                            **mem_fields(),
                        )
                    )
                    rowgroup_table = parquet_file.read_row_group(rg_idx)
                    # Inserts, commits and retries the row group as one unit.
                    rows_in_rg = self._import_rowgroup_with_retry(
                        c,
                        cur,
                        rowgroup_table,
                        insert_sql,
                        filename,
                        rg_idx,
                        parquet_file.num_row_groups,
                    )
                    total_rows += rows_in_rg
            else:
                for batch_idx, batch in enumerate(
                    parquet_table.to_batches(max_chunksize=self.batch_size)
                ):
                    rows = self._rows_from_batch(batch)
                    if self.transaction_mode == TransactionMode.BATCH:
                        # ``offset`` is the number of rows already imported
                        # from this file; it is only used for log context.
                        rows_in_batch = self._import_batch_with_retry(
                            c,
                            cur,
                            rows,
                            insert_sql,
                            filename,
                            batch=batch_idx,
                            offset=total_rows,
                        )
                        total_rows += rows_in_batch
                    else:
                        # No per-batch commit on this path.
                        cur.executemany(insert_sql, rows)
                        total_rows += len(rows)

            # Single commit for the whole file in FILE mode.
            if self.transaction_mode == TransactionMode.FILE:
                c.commit()

            logger.info(
                "Completed "
                + context(
                    file=filename,
                    rows=total_rows,
                    seconds=f"{time.time() - start:.2f}",
                    **mem_fields(),
                )
            )
406
+
407
+ # ------------------------------------------------------------------
408
+ # producer-consumer helpers
409
+ # ------------------------------------------------------------------
410
+
411
def _parquet_reader_thread(
    self,
    parquet_file: pq.ParquetFile,
    buf: queue.Queue[_QueueItem | object],
    filename: str,
    cancel: threading.Event,
) -> None:
    """Producer: read rowgroups/batches and enqueue row tuples.

    In ROWGROUP mode each row group is enqueued as one ``_QueueItem``;
    otherwise every row group is split into batches of at most
    ``self.batch_size`` rows.  An exception raised while reading is
    enqueued for the consumer to re-raise, and ``_SENTINEL`` is enqueued
    last to mark the end of the stream.

    Every enqueue is cancel-aware.  The queue is bounded, so a plain
    blocking ``put`` would hang forever once the consumer has failed and
    stopped draining it.  Instead the put is retried with a short timeout
    and abandoned as soon as *cancel* is set, which lets this thread
    terminate instead of leaking.
    """

    def _offer(item) -> bool:
        """Enqueue *item*; return False if *cancel* was set before it fit."""
        while not cancel.is_set():
            try:
                buf.put(item, timeout=0.1)
            except queue.Full:
                continue
            return True
        return False

    total_rg = parquet_file.num_row_groups
    try:
        for rg_idx in range(total_rg):
            if cancel.is_set():
                return
            rg_label = f"{rg_idx + 1}/{total_rg}"
            # Back-pressure on memory before pulling the next row group in.
            self.mem_gate.check(f"read_row_group file={filename} rg={rg_label}")
            logger.debug(
                "Reading row group "
                + context(file=filename, rg=rg_label, **mem_fields())
            )
            table = parquet_file.read_row_group(rg_idx)

            if self.transaction_mode == TransactionMode.ROWGROUP:
                rows = rows_from_arrow(table)
                delivered = _offer(
                    _QueueItem(
                        rows=rows,
                        rg_label=rg_label,
                        batch_idx=None,
                        row_count=len(rows),
                    )
                )
                if not delivered:
                    return
            else:
                for batch_idx, batch in enumerate(
                    table.to_batches(max_chunksize=self.batch_size)
                ):
                    if cancel.is_set():
                        return
                    rows = self._rows_from_batch(batch)
                    delivered = _offer(
                        _QueueItem(
                            rows=rows,
                            rg_label=rg_label,
                            batch_idx=batch_idx,
                            row_count=len(rows),
                        )
                    )
                    if not delivered:
                        return
    except Exception as exc:
        # Hand the failure to the consumer, which re-raises it.
        _offer(exc)
        return
    finally:
        # End-of-stream marker.  Skipped once cancelled: the consumer has
        # already stopped reading, so nobody is waiting for it.
        _offer(_SENTINEL)
462
+
463
def _import_file_with_mssql(
    self, filepath, filename, start, target_table: str | None = None
):
    """Import a parquet file using mssql-python's bulkcopy API.

    Uses a producer-consumer pattern: a reader thread pre-reads
    rowgroups/batches into a bounded queue while the caller thread
    pushes rows to SQL Server via ``cursor.bulkcopy``. This keeps
    the TDS pipe fed and avoids ASYNC_NETWORK_IO waits on the
    server.

    ``filename`` is used for logging and error context, ``start`` is the
    ``time.time()`` value the elapsed-seconds log field is measured from,
    and ``target_table`` overrides the destination table when given.
    """
    parquet_file = pq.ParquetFile(filepath)
    columns = parquet_file.schema.names
    target_table_name = target_table or self.full_table_name()

    conn = self.connection_m()
    try:
        # Schema check uses a short-lived cursor that is closed before the
        # reader thread is started.
        cur = conn.cursor()
        try:
            table_columns = [
                col[0]
                for col in cur.execute(
                    f"SELECT TOP 0 * FROM {target_table_name}"
                ).description
            ]
            self.validate_schema(columns, table_columns, filename)
        finally:
            cur.close()

        total_rows = 0
        # Bounded queue: the reader can be at most two items ahead.
        buf: queue.Queue[_QueueItem | object] = queue.Queue(maxsize=2)
        # Set by this (consumer) thread to ask the reader to stop.
        cancel = threading.Event()

        reader = threading.Thread(
            target=self._parquet_reader_thread,
            args=(parquet_file, buf, filename, cancel),
            daemon=True,
            name=f"mssql-reader-{filename}",
        )
        reader.start()

        try:
            while True:
                item = buf.get()
                # The reader signals end-of-stream with the sentinel ...
                if item is _SENTINEL:
                    break
                # ... and forwards its own failure as an exception object.
                if isinstance(item, Exception):
                    raise item
                assert isinstance(item, _QueueItem)

                if self.transaction_mode == TransactionMode.ROWGROUP:
                    rows_in_rg = self._mssql_bulkcopy_with_retry(
                        conn,
                        item.rows,
                        columns,
                        target_table_name,
                        filename,
                        op="bulkcopy(rowgroup)",
                        rg=item.rg_label,
                        offset=total_rows,
                        is_rows=True,
                    )
                    total_rows += rows_in_rg
                    logger.debug(
                        "Processed row group "
                        + context(
                            file=filename,
                            rg=item.rg_label,
                            **mem_fields(),
                        )
                    )
                elif self.transaction_mode == TransactionMode.BATCH:
                    rows_in_batch = self._mssql_bulkcopy_with_retry(
                        conn,
                        item.rows,
                        columns,
                        target_table_name,
                        filename,
                        op="bulkcopy(batch)",
                        rg=item.rg_label,
                        batch=item.batch_idx,
                        offset=total_rows,
                        is_rows=True,
                    )
                    total_rows += rows_in_batch
                else:
                    # FILE / ROW modes — no per-item retry
                    cursor = conn.cursor()
                    try:
                        cursor.bulkcopy(
                            target_table_name,
                            item.rows,
                            column_mappings=columns,
                        )
                    finally:
                        cursor.close()
                    total_rows += item.row_count
        except BaseException:
            # Any failure (including KeyboardInterrupt) tells the reader to stop.
            cancel.set()
            raise
        finally:
            # Bounded wait: a reader that cannot finish promptly must not
            # hang the import; it is a daemon thread.
            reader.join(timeout=5)

        # Single commit for the whole file in FILE mode.
        if self.transaction_mode == TransactionMode.FILE:
            conn.commit()

        logger.info(
            "Completed "
            + context(
                file=filename,
                rows=total_rows,
                seconds=f"{time.time() - start:.2f}",
                **mem_fields(),
            )
        )
    finally:
        conn.close()
580
+
581
def _mssql_bulkcopy_with_retry(
    self,
    conn,
    data,
    columns,
    target_table_name,
    filename,
    op="bulkcopy",
    rg=None,
    batch=None,
    offset=None,
    is_rows=False,
):
    """Execute bulkcopy with retry logic.

    *data* is either an Arrow table (converted here) or, when *is_rows* is
    true, an already converted list of row tuples.  Each attempt uses a
    fresh cursor; in BATCH and ROWGROUP modes a successful attempt is
    committed immediately.  Between attempts the connection is rolled back
    and the thread sleeps ``2**attempt`` seconds.

    At least one attempt is always made, even when ``config.retries`` is
    zero or negative; previously such a configuration skipped the copy and
    returned ``None``.

    Returns the driver-reported ``rows_copied`` when the result is a dict,
    otherwise ``len(rows)``.  Raises ``BatchImportError`` after the last
    failed attempt; ``MemoryError`` is never retried.
    """
    rows = data if is_rows else self._rows_from_arrow_table(data)

    # ``retries`` is the total attempt budget; never let it drop below one.
    attempts = max(1, self.config.retries)

    for attempt in range(attempts):
        try:
            cursor = conn.cursor()
            try:
                result = cursor.bulkcopy(
                    target_table_name,
                    rows,
                    column_mappings=columns,
                )
            finally:
                cursor.close()
            if self.transaction_mode in (
                TransactionMode.BATCH,
                TransactionMode.ROWGROUP,
            ):
                conn.commit()
            return (
                result.get("rows_copied", len(rows))
                if isinstance(result, dict)
                else len(rows)
            )
        except MemoryError:
            logger.error(
                f"Out of memory during {op} - not retrying (fatal) "
                + context(file=filename, rg=rg, batch=batch, offset=offset)
            )
            raise
        except Exception as e:
            safe_msg = self.safe_error_message(e)
            if attempt < attempts - 1:
                logger.warning(
                    f"{op} attempt {attempt + 1}/{self.config.retries} failed "
                    + context(
                        file=filename,
                        rg=rg,
                        batch=batch,
                        rows=len(rows),
                        offset=offset,
                    )
                    + f": {safe_msg}"
                )
                # Discard the partial copy before trying again.
                conn.rollback()
                time.sleep(2**attempt)
            else:
                raise BatchImportError(
                    f"Bulk copy failed after {self.config.retries} retries "
                    + context(file=filename, rg=rg, batch=batch, offset=offset)
                    + f": {safe_msg}"
                ) from e
649
+
650
def _rows_from_arrow_table(self, table):
    """Turn a PyArrow table into the row list that bulkcopy consumes.

    Thin wrapper around ``rows_from_arrow``.
    """
    bulk_rows = rows_from_arrow(table)
    return bulk_rows
653
+
654
+ def _import_batch_with_retry(
655
+ self,
656
+ c,
657
+ cur,
658
+ rows_or_batch,
659
+ insert_sql,
660
+ filename,
661
+ rg=None,
662
+ batch=None,
663
+ offset=None,
664
+ ):
665
+ """Import a single batch with retry logic for BATCH mode."""
666
+ rows = (
667
+ rows_or_batch
668
+ if isinstance(rows_or_batch, list)
669
+ else self._rows_from_batch(rows_or_batch)
670
+ )
671
+
672
+ for attempt in range(self.config.retries):
673
+ try:
674
+ cur.executemany(insert_sql, rows)
675
+ c.commit()
676
+ return len(rows)
677
+ except MemoryError:
678
+ logger.error(
679
+ "Out of memory during batch insert - not retrying (fatal) "
680
+ + context(file=filename, rg=rg, batch=batch, offset=offset)
681
+ )
682
+ raise
683
+ except Exception as e:
684
+ safe_msg = self.safe_error_message(e)
685
+
686
+ if attempt < self.config.retries - 1:
687
+ logger.warning(
688
+ f"batch insert attempt {attempt + 1}/{self.config.retries} "
689
+ "failed "
690
+ + context(
691
+ file=filename,
692
+ rg=rg,
693
+ batch=batch,
694
+ rows=len(rows),
695
+ offset=offset,
696
+ )
697
+ + f": {safe_msg}"
698
+ )
699
+ c.rollback()
700
+ time.sleep(2**attempt)
701
+ else:
702
+ raise BatchImportError(
703
+ f"Batch import failed after {self.config.retries} retries "
704
+ + context(file=filename, rg=rg, batch=batch, offset=offset)
705
+ + f": {safe_msg}"
706
+ ) from e
707
+
708
def _import_rowgroup_with_retry(
    self,
    c,
    cur,
    table_or_batch,
    insert_sql,
    filename,
    rg_idx=None,
    total_rg=None,
    *,
    rg=None,
):
    """Import a single row group with retry logic for ROWGROUP mode.

    ``table_or_batch`` may be a PyArrow Table, RecordBatch, or a
    pre-converted ``list[tuple]`` of rows (from the reader thread).
    Pass *rg* to supply a pre-formatted label; otherwise it is
    derived from *rg_idx* / *total_rg*.

    The row group is written in chunks of at most ``self.batch_size`` rows
    and committed once after the last chunk.  Between attempts the
    connection is rolled back and the thread sleeps ``2**attempt`` seconds.

    At least one attempt is always made, even when ``config.retries`` is
    zero or negative; previously such a configuration inserted nothing and
    returned ``None``.

    Returns the number of rows inserted.  Raises ``RowGroupImportError``
    after the last failed attempt; ``MemoryError`` is never retried.
    """
    if rg is None:
        rg = f"{rg_idx + 1}/{total_rg}"
    is_rows = isinstance(table_or_batch, list)

    # ``retries`` is the total attempt budget; never let it drop below one.
    attempts = max(1, self.config.retries)

    for attempt in range(attempts):
        try:
            total_rows = 0
            if is_rows:
                for i in range(0, len(table_or_batch), self.batch_size):
                    chunk = table_or_batch[i : i + self.batch_size]
                    cur.executemany(insert_sql, chunk)
                    total_rows += len(chunk)
            else:
                # Tables expose ``to_batches``; anything without it is
                # treated as one ready-made batch.
                to_batches = getattr(table_or_batch, "to_batches", None)
                rowgroup_batches = (
                    table_or_batch.to_batches(max_chunksize=self.batch_size)
                    if callable(to_batches)
                    else [table_or_batch]
                )
                for batch in rowgroup_batches:
                    rows = rows_from_arrow(batch)
                    cur.executemany(insert_sql, rows)
                    total_rows += len(rows)

            # One commit per row group, after every chunk went through.
            c.commit()
            logger.debug("Processed row group " + context(file=filename, rg=rg))
            return total_rows
        except MemoryError:
            logger.error(
                "Out of memory during rowgroup insert - not retrying (fatal) "
                + context(file=filename, rg=rg)
            )
            raise
        except Exception as e:
            safe_msg = self.safe_error_message(e)

            if attempt < attempts - 1:
                logger.warning(
                    f"rowgroup insert attempt {attempt + 1}/{self.config.retries} "
                    "failed " + context(file=filename, rg=rg) + f": {safe_msg}"
                )
                # Discard the chunks written so far before trying again.
                c.rollback()
                time.sleep(2**attempt)
            else:
                raise RowGroupImportError(
                    f"Row group import failed after {self.config.retries} retries "
                    + context(file=filename, rg=rg)
                    + f": {safe_msg}"
                ) from e
775
+
776
+ def _make_temp_table_name(self, worker_index: int) -> str:
777
+ suffix = uuid.uuid4().hex[:8]
778
+ return f"{self.schema}.{self.table}_{worker_index + 1:02d}_{suffix}"
779
+
780
def _make_columnstore_index_name(self, temp_table_name: str) -> str:
    """Return the quoted ``cci_<table>`` index name for a temp table.

    Everything up to and including the first ``.`` (the schema prefix) is
    dropped; a name without a dot is used as is.
    """
    _, separator, remainder = temp_table_name.partition(".")
    table_part = remainder if separator else temp_table_name
    return quote_identifier(f"cci_{table_part}")
783
+
784
+ def _execute_temp_table_ddl(self, cur, count: int) -> list[str]:
785
+ """Run the CREATE TABLE / CCI DDL on a cursor, returning table names."""
786
+ temp_tables: list[str] = []
787
+ for i in range(count):
788
+ temp_table_name = self._make_temp_table_name(i)
789
+ cur.execute(
790
+ f"SELECT TOP 0 * INTO {temp_table_name} FROM {self.full_table_name()}"
791
+ )
792
+ if self.create_cci:
793
+ index_name = self._make_columnstore_index_name(temp_table_name)
794
+ cur.execute(
795
+ f"CREATE CLUSTERED COLUMNSTORE INDEX {index_name} "
796
+ f"ON {temp_table_name}"
797
+ )
798
+ temp_tables.append(temp_table_name)
799
+ return temp_tables
800
+
801
+ def _create_temp_tables(self, count: int) -> list[str]:
802
+ if self.engine == "mssql-python":
803
+ conn = self.connection_m(autocommit=True)
804
+ try:
805
+ cur = conn.cursor()
806
+ try:
807
+ return self._execute_temp_table_ddl(cur, count)
808
+ finally:
809
+ cur.close()
810
+ finally:
811
+ conn.close()
812
+ else:
813
+ with self.connection_p(autocommit=True) as conn:
814
+ with conn.cursor() as cur:
815
+ return self._execute_temp_table_ddl(cur, count)
816
+
817
+ def _assign_files_to_workers(
818
+ self, filenames: list[str], temp_tables: list[str]
819
+ ) -> dict[str, list[str]]:
820
+ assignments: dict[str, list[str]] = {tbl: [] for tbl in temp_tables}
821
+ for index, filename in enumerate(filenames):
822
+ target_table = temp_tables[index % len(temp_tables)]
823
+ assignments[target_table].append(filename)
824
+ return assignments
825
+
826
def _write_temp_manifest(self, temp_tables: list[str]) -> Path:
    """Write the temp-table names as a "tables" manifest and return what ``write_manifest`` returns."""
    destination = self.input_path / self.temp_manifest_filename
    return write_manifest(destination, temp_tables, manifest_type="tables")
832
+
833
+ def _import_files_to_temp_table(self, target_table: str, filenames: list[str]):
834
+ for filename in filenames:
835
+ self.import_file(filename, target_table=target_table)
836
+
837
+ def _delete_original_files(self, filenames: list[str]):
838
+ for filename in filenames:
839
+ path = self.input_path / filename
840
+ if path.exists():
841
+ path.unlink()
842
+
843
+ manifest_path = self.input_path / self.manifest_filename
844
+ if manifest_path.exists():
845
+ manifest_path.unlink()
846
+
847
def perform_work(self):
    """Run the import while a single memory heartbeat is active."""
    # The import is one process whose worker threads share memory, so a
    # single heartbeat around the whole run reports its RSS trend.
    heartbeat = MemoryHeartbeat(self.mem_heartbeat, unit="import")
    with heartbeat:
        self._perform_work()
852
+
853
def _perform_work(self):
    """Import every file from the manifest.

    With more than one worker and more than one file, each worker imports
    its share into its own temp table and the temp-table names are written
    to a manifest afterwards.  Otherwise the files are imported straight
    into the target table.
    """
    filenames = self.load_manifest_entries()

    log_memory_budget(
        operation="import",
        workers=self.worker_count,
        threshold_pct=self.mem_gate.threshold_pct,
    )

    parallel_temp_tables = self.worker_count > 1 and len(filenames) > 1

    if not parallel_temp_tables:
        # Direct path: one future per file, all into the target table.
        with ThreadPoolExecutor(
            max_workers=self.worker_count, thread_name_prefix="import"
        ) as executor:
            futures = {}
            for filename in filenames:
                futures[executor.submit(self.import_file, filename)] = filename

            self._await_futures(futures, label="file")
        return

    # Temp-table path: never start more workers than there are files.
    worker_count = min(self.worker_count, len(filenames))
    temp_tables = self._create_temp_tables(worker_count)
    assignments = self._assign_files_to_workers(filenames, temp_tables)

    with ThreadPoolExecutor(
        max_workers=worker_count, thread_name_prefix="import"
    ) as executor:
        futures = {}
        for target_table, assigned in assignments.items():
            if not assigned:
                continue
            submitted = executor.submit(
                self._import_files_to_temp_table, target_table, assigned
            )
            futures[submitted] = target_table

        self._await_futures(futures, label="table")

    manifest_file = self._write_temp_manifest(temp_tables)
    logger.info("Wrote temporary table manifest " + context(file=manifest_file))
    return
893
+
894
+ def _await_futures(self, futures, label):
895
+ """Wait for worker futures, surfacing *all* failures before re-raising.
896
+
897
+ Without this, a worker exception only re-raises as a bare traceback with
898
+ no indication of *which* file/table the dead worker was handling. We now
899
+ wait for every future so errors from all workers are logged, then raise
900
+ the first failure.
901
+ """
902
+ first_error: Exception | None = None
903
+ completed_units: list[str] = []
904
+ for future in as_completed(futures):
905
+ unit = futures[future]
906
+ try:
907
+ future.result()
908
+ completed_units.append(str(unit))
909
+ except Exception as e:
910
+ logger.error(
911
+ "Worker failed "
912
+ + context(**{label: unit})
913
+ + f": {self.safe_error_message(e)}"
914
+ )
915
+ if first_error is None:
916
+ first_error = e
917
+ if first_error is not None:
918
+ log_failure_summary(
919
+ operation="import",
920
+ workers=len(futures),
921
+ completed=completed_units,
922
+ failed_error=self.safe_error_message(first_error),
923
+ )
924
+ raise first_error
925
+
926
+
927
if __name__ == "__main__":
    # No script behaviour: running this module directly does nothing.
    pass