pybutt 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- old_tests/app.py +713 -0
- pybutt/__init__.py +17 -0
- pybutt/cli/__init__.py +11 -0
- pybutt/cli/app.py +94 -0
- pybutt/cli/combine_command.py +236 -0
- pybutt/cli/export_command.py +317 -0
- pybutt/cli/import_command.py +286 -0
- pybutt/cli/inspect_command.py +30 -0
- pybutt/cli/purge_command.py +235 -0
- pybutt/core/__init__.py +30 -0
- pybutt/core/base.py +124 -0
- pybutt/core/config.py +144 -0
- pybutt/core/logobs.py +445 -0
- pybutt/exceptions.py +82 -0
- pybutt/files/__init__.py +28 -0
- pybutt/files/combine.py +93 -0
- pybutt/files/inspect.py +51 -0
- pybutt/files/manifest.py +160 -0
- pybutt/io/__init__.py +6 -0
- pybutt/io/combiner.py +119 -0
- pybutt/io/exporter.py +612 -0
- pybutt/io/importer.py +928 -0
- pybutt/io/purger.py +44 -0
- pybutt-2.0.0.dist-info/METADATA +756 -0
- pybutt-2.0.0.dist-info/RECORD +39 -0
- pybutt-2.0.0.dist-info/WHEEL +5 -0
- pybutt-2.0.0.dist-info/entry_points.txt +2 -0
- pybutt-2.0.0.dist-info/licenses/LICENSE +21 -0
- pybutt-2.0.0.dist-info/top_level.txt +3 -0
- tests/conftest.py +22 -0
- tests/test_cli.py +979 -0
- tests/test_cli_help.py +130 -0
- tests/test_combiner.py +259 -0
- tests/test_core.py +1009 -0
- tests/test_exporter.py +637 -0
- tests/test_files.py +178 -0
- tests/test_import_retry_logic.py +837 -0
- tests/test_logobs.py +491 -0
- tests/test_purge.py +219 -0
pybutt/io/importer.py
ADDED
|
@@ -0,0 +1,928 @@
|
|
|
1
|
+
import queue
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
import uuid
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pyarrow.parquet as pq
|
|
11
|
+
|
|
12
|
+
from pybutt.core.base import SqlServerIOBase, rows_from_arrow
|
|
13
|
+
from pybutt.core.config import (
|
|
14
|
+
BATCH_SIZE_DEFAULT,
|
|
15
|
+
CCI_DEFAULT,
|
|
16
|
+
IMPORT_ENGINE_DEFAULT,
|
|
17
|
+
MEM_COOLDOWN_DEFAULT,
|
|
18
|
+
MEM_HEARTBEAT_DEFAULT,
|
|
19
|
+
MEM_MAX_WAIT_DEFAULT,
|
|
20
|
+
MEM_SLEEP_DEFAULT,
|
|
21
|
+
MEM_THRESHOLD_DEFAULT,
|
|
22
|
+
SCHEMA_DEFAULT,
|
|
23
|
+
TRANSACTION_MODE_DEFAULT,
|
|
24
|
+
SqlConfig,
|
|
25
|
+
TransactionMode,
|
|
26
|
+
coerce_transaction_mode,
|
|
27
|
+
quote_identifier,
|
|
28
|
+
validate_engine,
|
|
29
|
+
validate_identifier,
|
|
30
|
+
)
|
|
31
|
+
from pybutt.core.logobs import (
|
|
32
|
+
MemoryGate,
|
|
33
|
+
MemoryHeartbeat,
|
|
34
|
+
context,
|
|
35
|
+
get_logger,
|
|
36
|
+
log_failure_summary,
|
|
37
|
+
log_memory_budget,
|
|
38
|
+
mem_fields,
|
|
39
|
+
)
|
|
40
|
+
from pybutt.exceptions import (
|
|
41
|
+
BatchImportError,
|
|
42
|
+
RowGroupImportError,
|
|
43
|
+
SchemaMismatchError,
|
|
44
|
+
)
|
|
45
|
+
from pybutt.files import (
|
|
46
|
+
default_import_manifest_filename,
|
|
47
|
+
default_manifest_filename,
|
|
48
|
+
load_file_manifest,
|
|
49
|
+
load_manifest,
|
|
50
|
+
validate_manifest_entries,
|
|
51
|
+
write_manifest,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Module-level logger shared by every import code path in this file.
logger = get_logger("importer")

# Unique end-of-stream marker: the parquet reader thread enqueues it last and
# the consuming loops stop when ``item is _SENTINEL``.
_SENTINEL = object()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(slots=True)
class _QueueItem:
    """Payload passed from the reader thread to the writer.

    One item carries either a whole row group (ROWGROUP transaction mode) or
    a single batch cut from a row group (all other modes).
    """

    # Row tuples already converted from Arrow, ready for insertion.
    rows: list[tuple[Any, ...]]
    # Label of the originating row group, formatted "<1-based index>/<total>".
    rg_label: str
    # Position of the batch inside its row group; None for whole-row-group items.
    batch_idx: int | None
    # Number of rows in ``rows`` (the reader sets it to ``len(rows)``).
    row_count: int
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Importer(SqlServerIOBase):
|
|
70
|
+
def __init__(
    self,
    config: SqlConfig,
    table: str,
    input_path,
    manifest_filename: str | None,
    schema: str = SCHEMA_DEFAULT,
    worker_count=1,
    batch_size: int = BATCH_SIZE_DEFAULT,
    transaction_mode: TransactionMode = TRANSACTION_MODE_DEFAULT,
    engine=IMPORT_ENGINE_DEFAULT,
    temp_manifest_filename: str | None = None,
    create_cci: bool = CCI_DEFAULT,
    mem_heartbeat: float = MEM_HEARTBEAT_DEFAULT,
    mem_threshold: float = MEM_THRESHOLD_DEFAULT,
    mem_sleep: float = MEM_SLEEP_DEFAULT,
    mem_max_wait: float = MEM_MAX_WAIT_DEFAULT,
    mem_cooldown: float = MEM_COOLDOWN_DEFAULT,
):
    """Configure an importer for one target table.

    Args:
        config: Connection/retry settings, forwarded to the base class.
        table: Target table name; passed through ``validate_identifier``.
        input_path: Directory holding the parquet files and manifests.
        manifest_filename: Manifest to read; a falsy value selects the
            default name derived from schema and table.
        schema: Target schema name; passed through ``validate_identifier``.
        worker_count: Number of worker threads used by ``perform_work``.
        batch_size: Maximum rows per insert chunk.
        transaction_mode: Commit granularity; coerced with
            ``coerce_transaction_mode``.
        engine: Import engine name; checked with ``validate_engine``.
        temp_manifest_filename: Manifest written for temp tables; a falsy
            value selects the default import-manifest name.
        create_cci: Whether temp tables get a clustered columnstore index.
        mem_heartbeat: Value handed to ``MemoryHeartbeat`` in ``perform_work``.
        mem_threshold, mem_sleep, mem_max_wait, mem_cooldown: Forwarded
            positionally, in this order, to ``MemoryGate``.
    """
    super().__init__(config)

    # Identifiers are validated first because the default manifest names
    # below are derived from them.
    self.schema = validate_identifier(schema)
    self.table = validate_identifier(table)

    self.input_path = Path(input_path)
    self.manifest_filename = manifest_filename or default_manifest_filename(
        self.schema, self.table
    )
    self.temp_manifest_filename = (
        temp_manifest_filename
        or default_import_manifest_filename(self.schema, self.table)
    )

    self.worker_count = worker_count
    self.transaction_mode = coerce_transaction_mode(transaction_mode)

    # The engine is stored only after it has passed validation.
    validate_engine(engine)
    self.engine = engine

    self.batch_size = batch_size
    self.create_cci = create_cci
    self.mem_heartbeat = mem_heartbeat
    self.mem_gate = MemoryGate(mem_threshold, mem_sleep, mem_max_wait, mem_cooldown)
|
|
113
|
+
|
|
114
|
+
def load_manifest(self):
    """Load the manifest file found at ``input_path / manifest_filename``.

    Delegates to the module-level ``load_manifest`` helper and returns
    whatever it returns.
    """
    return load_manifest(self.input_path / self.manifest_filename)
|
|
117
|
+
|
|
118
|
+
def load_manifest_entries(self):
    """Read the file manifest and return its validated entries.

    The manifest is loaded with ``operation="Importer"`` and then checked
    against ``input_path`` by ``validate_manifest_entries``.
    """
    manifest_path = self.input_path / self.manifest_filename
    file_manifest = load_file_manifest(manifest_path, operation="Importer")
    return validate_manifest_entries(file_manifest, self.input_path)
|
|
123
|
+
|
|
124
|
+
def _build_insert_sql(
    self, columns: list[str], target_table: str | None = None
) -> str:
    """Build a parameterised ``INSERT`` statement for *columns*.

    Args:
        columns: Column names; each is quoted with ``quote_identifier``.
        target_table: Destination table text used verbatim; a falsy value
            falls back to ``self.full_table_name()``.

    Returns:
        ``INSERT INTO <table> (<cols>) VALUES (?, ?, ...)`` with one ``?``
        placeholder per column.
    """
    quoted_columns = [quote_identifier(name) for name in columns]
    markers = ["?" for _ in columns]
    destination = target_table if target_table else self.full_table_name()
    column_list = ", ".join(quoted_columns)
    placeholders = ", ".join(markers)
    return f"INSERT INTO {destination} ({column_list}) VALUES ({placeholders})"
|
|
131
|
+
|
|
132
|
+
def _rows_from_batch(self, batch):
    """Convert an Arrow batch to rows by delegating to ``rows_from_arrow``."""
    converted = rows_from_arrow(batch)
    return converted
|
|
134
|
+
|
|
135
|
+
def _validate_and_build_insert(
|
|
136
|
+
self, cur, columns, filename, target_table: str | None = None
|
|
137
|
+
):
|
|
138
|
+
table_columns = self.get_table_columns(cur, target_table=target_table)
|
|
139
|
+
self.validate_schema(columns, table_columns, filename)
|
|
140
|
+
return self._build_insert_sql(columns, target_table=target_table)
|
|
141
|
+
|
|
142
|
+
def get_table_columns(self, cur, target_table: str | None = None):
|
|
143
|
+
target_table = target_table or self.full_table_name()
|
|
144
|
+
cur.execute(f"SELECT TOP 0 * FROM {target_table}")
|
|
145
|
+
return [column[0] for column in cur.description]
|
|
146
|
+
|
|
147
|
+
def validate_schema(self, parquet_columns, table_columns, filename):
    """Ensure the parquet file and the SQL table expose the same columns.

    The comparison is on sets of names, so ordering and duplicates are
    ignored and matching is exact (case-sensitive).

    Args:
        parquet_columns: Column names read from the parquet file.
        table_columns: Column names read from the SQL table.
        filename: File name included in the error message.

    Raises:
        SchemaMismatchError: If the two sets of names differ; the message
            lists the names missing on each side.
    """
    in_parquet = set(parquet_columns)
    in_table = set(table_columns)

    if in_parquet == in_table:
        return

    missing_in_sql = in_parquet - in_table
    missing_in_parquet = in_table - in_parquet
    raise SchemaMismatchError(
        f"Schema mismatch in {filename}:\n"
        f" Columns in parquet but not SQL: {missing_in_sql}\n"
        f" Columns in SQL but not parquet: {missing_in_parquet}"
    )
|
|
160
|
+
|
|
161
|
+
def import_file(self, filename, target_table: str | None = None):
    """Import one parquet file from ``input_path`` into SQL Server.

    The engine-specific routine is selected from ``self.engine``
    ("duckdb", "mssql-python", anything else uses the default
    implementation).  In FILE transaction mode the whole call is wrapped in
    ``self.retry``; in every other mode it is called once, because retries
    happen at a finer granularity inside the routine.

    Args:
        filename: File name relative to ``input_path``.
        target_table: Optional destination override; defaults to
            ``self.full_table_name()``.

    Returns:
        ``True`` when the import finished without raising.

    Raises:
        MemoryError: Logged as fatal and re-raised.
        Exception: Any other failure is logged and re-raised.
    """
    filepath = self.input_path / filename
    start = time.time()
    destination = target_table or self.full_table_name()

    logger.info(
        "Importing "
        + context(
            file=filename,
            table=destination,
            engine=self.engine,
            batch_size=self.batch_size,
            transaction_mode=self.transaction_mode.value,
            **mem_fields(),
        )
    )

    def _run_engine():
        # Resolved on every call so a retry re-reads the engine selection.
        if self.engine == "duckdb":
            handler = self._import_file_with_duckdb
        elif self.engine == "mssql-python":
            handler = self._import_file_with_mssql
        else:
            handler = self._import_file_impl
        return handler(filepath, filename, start, target_table=target_table)

    try:
        if self.transaction_mode == TransactionMode.FILE:
            # For FILE mode, wrap entire operation in retry logic
            self.retry(_run_engine, context=f"Import file {filename}")
        else:
            # For BATCH, ROWGROUP, and ROW modes, retries happen at granular level
            _run_engine()
    except MemoryError:
        logger.error("Out of memory - not retrying (fatal) " + context(file=filename))
        raise
    except Exception as e:
        logger.error(
            "Failed importing " + context(file=filename) + f": {self.safe_error_message(e)}"
        )
        logger.debug("Traceback for failed import of %s", filename, exc_info=True)
        raise

    return True
|
|
224
|
+
|
|
225
|
+
def _import_file_impl(
    self, filepath, filename, start, target_table: str | None = None
):
    """Implementation of file import with transaction management.

    Uses a producer-consumer pattern: a reader thread pre-reads
    rowgroups/batches into a bounded queue while the caller thread
    pushes rows to SQL Server via ``cur.executemany``.  This keeps
    the TDS pipe fed and reduces ASYNC_NETWORK_IO waits.

    Args:
        filepath: Full path of the parquet file to read.
        filename: File name used in log messages and the reader thread name.
        start: ``time.time()`` value taken by the caller, used to report
            the elapsed seconds.
        target_table: Optional destination override, forwarded to
            ``_validate_and_build_insert``.

    Raises:
        Exception: Any exception object the reader thread placed on the
            queue is re-raised here; insert/commit errors propagate too.
    """
    with self.connection_p() as c:
        with c.cursor() as cur:
            cur.fast_executemany = True
            parquet_file = pq.ParquetFile(filepath)
            columns = parquet_file.schema.names
            # Fails before any data is read when the column sets differ.
            insert_sql = self._validate_and_build_insert(
                cur, columns, filename, target_table=target_table
            )

            total_rows = 0
            # At most two pre-read items are buffered ahead of the writer.
            buf: queue.Queue[_QueueItem | object] = queue.Queue(maxsize=2)
            # Set on failure so the reader stops at its next cancel check.
            cancel = threading.Event()

            reader = threading.Thread(
                target=self._parquet_reader_thread,
                args=(parquet_file, buf, filename, cancel),
                daemon=True,
                name=f"pyodbc-reader-{filename}",
            )
            reader.start()

            try:
                while True:
                    item = buf.get()
                    # The reader enqueues the sentinel as its last action.
                    if item is _SENTINEL:
                        break
                    # Reader-side failures arrive as exception objects.
                    if isinstance(item, Exception):
                        raise item
                    assert isinstance(item, _QueueItem)

                    if self.transaction_mode == TransactionMode.ROWGROUP:
                        rows_in_rg = self._import_rowgroup_with_retry(
                            c,
                            cur,
                            item.rows,
                            insert_sql,
                            filename,
                            rg=item.rg_label,
                        )
                        total_rows += rows_in_rg
                    elif self.transaction_mode == TransactionMode.BATCH:
                        rows_in_batch = self._import_batch_with_retry(
                            c,
                            cur,
                            item.rows,
                            insert_sql,
                            filename,
                            rg=item.rg_label,
                            batch=item.batch_idx,
                            offset=total_rows,
                        )
                        total_rows += rows_in_batch
                    else:
                        # FILE / ROW modes — no per-item commit
                        cur.executemany(insert_sql, item.rows)
                        total_rows += item.row_count
            except BaseException:
                cancel.set()
                raise
            finally:
                # Bounded wait: the reader is a daemon thread, so a reader
                # that is still running after 5 seconds is left behind.
                reader.join(timeout=5)

            # Commit after entire file if in FILE mode
            # NOTE(review): no explicit commit is issued here for ROW mode;
            # presumably the connection_p() context handles it - confirm.
            if self.transaction_mode == TransactionMode.FILE:
                c.commit()

            logger.info(
                "Completed "
                + context(
                    file=filename,
                    rows=total_rows,
                    seconds=f"{time.time() - start:.2f}",
                    **mem_fields(),
                )
            )
|
|
310
|
+
|
|
311
|
+
def _load_parquet_with_duckdb(self, filepath):
    """Read an entire parquet file through DuckDB into an Arrow table.

    The path is embedded in the SQL text as a string literal, so single
    quotes are doubled before interpolation.

    Args:
        filepath: Path object of the parquet file (``as_posix`` is used).

    Returns:
        The result of ``fetch_arrow_table()`` for the whole file.
    """
    source = str(filepath)
    logger.debug("Loading parquet via DuckDB " + context(file=source, **mem_fields()))

    with self.connection_d() as dconn:
        posix_path = str(filepath.as_posix())
        escaped_path = posix_path.replace("'", "''")
        query = f"SELECT * FROM read_parquet('{escaped_path}')"
        table = dconn.execute(query).fetch_arrow_table()

    logger.debug(
        "Loaded parquet via DuckDB "
        + context(file=source, rows=table.num_rows, **mem_fields())
    )
    return table
|
|
329
|
+
|
|
330
|
+
def _import_file_with_duckdb(
    self, filepath, filename, start, target_table: str | None = None
):
    """Import a parquet file using the "duckdb" engine path.

    In ROWGROUP mode the file is opened with ``pq.ParquetFile`` and read one
    row group at a time; in every other mode the whole file is first loaded
    into an Arrow table via ``_load_parquet_with_duckdb`` and then cut into
    batches of at most ``self.batch_size`` rows.  Rows are written with
    ``cur.executemany`` on a ``connection_p`` connection.

    Args:
        filepath: Full path of the parquet file to read.
        filename: File name used in log messages and error context.
        start: ``time.time()`` value taken by the caller, used to report
            the elapsed seconds.
        target_table: Optional destination override, forwarded to
            ``_validate_and_build_insert``.
    """
    with self.connection_p() as c:
        with c.cursor() as cur:
            cur.fast_executemany = True
            if self.transaction_mode == TransactionMode.ROWGROUP:
                # Only metadata is needed up front; data is read per row group.
                parquet_file = pq.ParquetFile(filepath)
                columns = parquet_file.schema.names
            else:
                # The full table is materialised before any insert happens.
                parquet_table = self._load_parquet_with_duckdb(filepath)
                columns = parquet_table.schema.names

            insert_sql = self._validate_and_build_insert(
                cur, columns, filename, target_table=target_table
            )

            total_rows = 0

            if self.transaction_mode == TransactionMode.ROWGROUP:
                for rg_idx in range(parquet_file.num_row_groups):
                    # Memory gate is consulted before each row group is read.
                    self.mem_gate.check(
                        f"read_row_group file={filename}"
                        f" rg={rg_idx + 1}/{parquet_file.num_row_groups}"
                    )
                    logger.debug(
                        "Reading row group "
                        + context(
                            file=filename,
                            rg=f"{rg_idx + 1}/{parquet_file.num_row_groups}",
                            **mem_fields(),
                        )
                    )
                    rowgroup_table = parquet_file.read_row_group(rg_idx)
                    # The Arrow table is passed as-is; the helper splits it
                    # into batches and commits once per row group.
                    rows_in_rg = self._import_rowgroup_with_retry(
                        c,
                        cur,
                        rowgroup_table,
                        insert_sql,
                        filename,
                        rg_idx,
                        parquet_file.num_row_groups,
                    )
                    total_rows += rows_in_rg
            else:
                for batch_idx, batch in enumerate(
                    parquet_table.to_batches(max_chunksize=self.batch_size)
                ):
                    rows = self._rows_from_batch(batch)
                    if self.transaction_mode == TransactionMode.BATCH:
                        rows_in_batch = self._import_batch_with_retry(
                            c,
                            cur,
                            rows,
                            insert_sql,
                            filename,
                            batch=batch_idx,
                            offset=total_rows,
                        )
                        total_rows += rows_in_batch
                    else:
                        # Remaining modes: insert without a per-batch commit.
                        cur.executemany(insert_sql, rows)
                        total_rows += len(rows)

            # NOTE(review): only FILE mode commits explicitly here; for the
            # other non-retry mode the commit is presumably handled by the
            # connection_p() context - confirm.
            if self.transaction_mode == TransactionMode.FILE:
                c.commit()

            logger.info(
                "Completed "
                + context(
                    file=filename,
                    rows=total_rows,
                    seconds=f"{time.time() - start:.2f}",
                    **mem_fields(),
                )
            )
|
|
406
|
+
|
|
407
|
+
# ------------------------------------------------------------------
|
|
408
|
+
# producer-consumer helpers
|
|
409
|
+
# ------------------------------------------------------------------
|
|
410
|
+
|
|
411
|
+
def _parquet_reader_thread(
    self,
    parquet_file: pq.ParquetFile,
    buf: queue.Queue[_QueueItem | object],
    filename: str,
    cancel: threading.Event,
) -> None:
    """Producer: read rowgroups/batches and enqueue row tuples.

    Runs on a background thread.  For each row group the memory gate is
    checked, the row group is read, and either the whole row group
    (ROWGROUP mode) or batches of at most ``self.batch_size`` rows (other
    modes) are converted to row tuples and placed on *buf* as
    ``_QueueItem`` objects.  A failure is forwarded to the consumer as the
    exception object itself, and ``_SENTINEL`` is enqueued last.

    Every enqueue polls *cancel* instead of blocking indefinitely: once the
    consumer has failed it no longer drains the bounded queue, so a plain
    blocking ``put`` would never return and the thread (with the rows it
    holds) would be stranded.  After cancellation nothing more is
    enqueued, not even the sentinel, because nobody is reading.

    Args:
        parquet_file: Open parquet file to read row groups from.
        buf: Bounded queue shared with the consumer.
        filename: File name used in log and memory-gate messages.
        cancel: Event the consumer sets to ask this thread to stop.
    """

    def _offer(item) -> bool:
        """Enqueue *item* unless cancelled; return True when enqueued."""
        while not cancel.is_set():
            try:
                buf.put(item, timeout=0.1)
            except queue.Full:
                # Consumer is busy (or gone): re-check cancel and retry.
                continue
            return True
        return False

    try:
        # Read inside the try so a metadata failure also reaches the
        # consumer instead of leaving it waiting on an empty queue.
        total_rg = parquet_file.num_row_groups
        for rg_idx in range(total_rg):
            if cancel.is_set():
                return
            rg_label = f"{rg_idx + 1}/{total_rg}"
            self.mem_gate.check(f"read_row_group file={filename} rg={rg_label}")
            logger.debug(
                "Reading row group "
                + context(file=filename, rg=rg_label, **mem_fields())
            )
            table = parquet_file.read_row_group(rg_idx)

            if self.transaction_mode == TransactionMode.ROWGROUP:
                rows = rows_from_arrow(table)
                item = _QueueItem(
                    rows=rows,
                    rg_label=rg_label,
                    batch_idx=None,
                    row_count=len(rows),
                )
                if not _offer(item):
                    return
            else:
                for batch_idx, batch in enumerate(
                    table.to_batches(max_chunksize=self.batch_size)
                ):
                    if cancel.is_set():
                        return
                    rows = self._rows_from_batch(batch)
                    item = _QueueItem(
                        rows=rows,
                        rg_label=rg_label,
                        batch_idx=batch_idx,
                        row_count=len(rows),
                    )
                    if not _offer(item):
                        return
    except Exception as exc:
        # Hand the failure to the consumer, which re-raises it.
        _offer(exc)
        return
    finally:
        # Always terminate the stream so the consumer's loop can end.
        _offer(_SENTINEL)
|
|
462
|
+
|
|
463
|
+
def _import_file_with_mssql(
    self, filepath, filename, start, target_table: str | None = None
):
    """Import a parquet file using mssql-python's bulkcopy API.

    Uses a producer-consumer pattern: a reader thread pre-reads
    rowgroups/batches into a bounded queue while the caller thread
    pushes rows to SQL Server via ``cursor.bulkcopy``.  This keeps
    the TDS pipe fed and avoids ASYNC_NETWORK_IO waits on the
    server.

    Args:
        filepath: Full path of the parquet file to read.
        filename: File name used in log messages and the reader thread name.
        start: ``time.time()`` value taken by the caller, used to report
            the elapsed seconds.
        target_table: Optional destination override; defaults to
            ``self.full_table_name()``.

    Raises:
        Exception: Any exception object the reader thread placed on the
            queue is re-raised here; bulkcopy/commit errors propagate too.
    """
    parquet_file = pq.ParquetFile(filepath)
    columns = parquet_file.schema.names
    target_table_name = target_table or self.full_table_name()

    conn = self.connection_m()
    try:
        # Schema check uses a short-lived cursor that is closed right away.
        cur = conn.cursor()
        try:
            table_columns = [
                col[0]
                for col in cur.execute(
                    f"SELECT TOP 0 * FROM {target_table_name}"
                ).description
            ]
            self.validate_schema(columns, table_columns, filename)
        finally:
            cur.close()

        total_rows = 0
        # At most two pre-read items are buffered ahead of the writer.
        buf: queue.Queue[_QueueItem | object] = queue.Queue(maxsize=2)
        # Set on failure so the reader stops at its next cancel check.
        cancel = threading.Event()

        reader = threading.Thread(
            target=self._parquet_reader_thread,
            args=(parquet_file, buf, filename, cancel),
            daemon=True,
            name=f"mssql-reader-{filename}",
        )
        reader.start()

        try:
            while True:
                item = buf.get()
                # The reader enqueues the sentinel as its last action.
                if item is _SENTINEL:
                    break
                # Reader-side failures arrive as exception objects.
                if isinstance(item, Exception):
                    raise item
                assert isinstance(item, _QueueItem)

                if self.transaction_mode == TransactionMode.ROWGROUP:
                    # is_rows=True: the reader already produced row tuples.
                    rows_in_rg = self._mssql_bulkcopy_with_retry(
                        conn,
                        item.rows,
                        columns,
                        target_table_name,
                        filename,
                        op="bulkcopy(rowgroup)",
                        rg=item.rg_label,
                        offset=total_rows,
                        is_rows=True,
                    )
                    total_rows += rows_in_rg
                    logger.debug(
                        "Processed row group "
                        + context(
                            file=filename,
                            rg=item.rg_label,
                            **mem_fields(),
                        )
                    )
                elif self.transaction_mode == TransactionMode.BATCH:
                    rows_in_batch = self._mssql_bulkcopy_with_retry(
                        conn,
                        item.rows,
                        columns,
                        target_table_name,
                        filename,
                        op="bulkcopy(batch)",
                        rg=item.rg_label,
                        batch=item.batch_idx,
                        offset=total_rows,
                        is_rows=True,
                    )
                    total_rows += rows_in_batch
                else:
                    # FILE / ROW modes — no per-item retry
                    cursor = conn.cursor()
                    try:
                        cursor.bulkcopy(
                            target_table_name,
                            item.rows,
                            column_mappings=columns,
                        )
                    finally:
                        cursor.close()
                    total_rows += item.row_count
        except BaseException:
            cancel.set()
            raise
        finally:
            # Bounded wait: the reader is a daemon thread, so a reader that
            # is still running after 5 seconds is left behind.
            reader.join(timeout=5)

        # NOTE(review): only FILE mode commits explicitly here; how ROW
        # mode data gets committed is not visible in this method - confirm.
        if self.transaction_mode == TransactionMode.FILE:
            conn.commit()

        logger.info(
            "Completed "
            + context(
                file=filename,
                rows=total_rows,
                seconds=f"{time.time() - start:.2f}",
                **mem_fields(),
            )
        )
    finally:
        conn.close()
|
|
580
|
+
|
|
581
|
+
def _mssql_bulkcopy_with_retry(
    self,
    conn,
    data,
    columns,
    target_table_name,
    filename,
    op="bulkcopy",
    rg=None,
    batch=None,
    offset=None,
    is_rows=False,
):
    """Execute bulkcopy with retry logic.

    Each attempt opens a fresh cursor, bulk-copies the rows and, in BATCH
    or ROWGROUP mode, commits.  A failed attempt is logged, rolled back and
    retried after an exponential back-off (``2**attempt`` seconds).  At
    least one attempt is always made, even when ``config.retries`` is zero
    or negative; previously such a setting skipped the copy entirely and
    returned ``None``.

    Args:
        conn: Open mssql-python connection.
        data: Row tuples when *is_rows* is true, otherwise an Arrow table
            that is converted with ``_rows_from_arrow_table``.
        columns: Column names passed as ``column_mappings``.
        target_table_name: Destination table text.
        filename, rg, batch, offset: Context included in log messages.
        op: Operation label used in log messages.
        is_rows: Whether *data* is already a list of row tuples.

    Returns:
        ``rows_copied`` from the bulkcopy result when it is a dict carrying
        that key, otherwise ``len(rows)``.

    Raises:
        MemoryError: Re-raised immediately, never retried.
        BatchImportError: When the last attempt fails.
    """
    rows = data if is_rows else self._rows_from_arrow_table(data)

    # Guarantee one attempt so a retries setting of 0 cannot silently skip
    # the copy.
    attempts = max(1, self.config.retries)

    for attempt in range(attempts):
        try:
            cursor = conn.cursor()
            try:
                result = cursor.bulkcopy(
                    target_table_name,
                    rows,
                    column_mappings=columns,
                )
            finally:
                cursor.close()
            if self.transaction_mode in (
                TransactionMode.BATCH,
                TransactionMode.ROWGROUP,
            ):
                conn.commit()
            return (
                result.get("rows_copied", len(rows))
                if isinstance(result, dict)
                else len(rows)
            )
        except MemoryError:
            logger.error(
                f"Out of memory during {op} - not retrying (fatal) "
                + context(file=filename, rg=rg, batch=batch, offset=offset)
            )
            raise
        except Exception as e:
            safe_msg = self.safe_error_message(e)
            if attempt < attempts - 1:
                logger.warning(
                    f"{op} attempt {attempt + 1}/{self.config.retries} failed "
                    + context(
                        file=filename,
                        rg=rg,
                        batch=batch,
                        rows=len(rows),
                        offset=offset,
                    )
                    + f": {safe_msg}"
                )
                # Discard the failed attempt's work before trying again.
                conn.rollback()
                time.sleep(2**attempt)
            else:
                raise BatchImportError(
                    f"Bulk copy failed after {self.config.retries} retries "
                    + context(file=filename, rg=rg, batch=batch, offset=offset)
                    + f": {safe_msg}"
                ) from e
|
|
649
|
+
|
|
650
|
+
def _rows_from_arrow_table(self, table):
    """Turn a PyArrow table into bulkcopy rows via ``rows_from_arrow``."""
    converted = rows_from_arrow(table)
    return converted
|
|
653
|
+
|
|
654
|
+
def _import_batch_with_retry(
|
|
655
|
+
self,
|
|
656
|
+
c,
|
|
657
|
+
cur,
|
|
658
|
+
rows_or_batch,
|
|
659
|
+
insert_sql,
|
|
660
|
+
filename,
|
|
661
|
+
rg=None,
|
|
662
|
+
batch=None,
|
|
663
|
+
offset=None,
|
|
664
|
+
):
|
|
665
|
+
"""Import a single batch with retry logic for BATCH mode."""
|
|
666
|
+
rows = (
|
|
667
|
+
rows_or_batch
|
|
668
|
+
if isinstance(rows_or_batch, list)
|
|
669
|
+
else self._rows_from_batch(rows_or_batch)
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
for attempt in range(self.config.retries):
|
|
673
|
+
try:
|
|
674
|
+
cur.executemany(insert_sql, rows)
|
|
675
|
+
c.commit()
|
|
676
|
+
return len(rows)
|
|
677
|
+
except MemoryError:
|
|
678
|
+
logger.error(
|
|
679
|
+
"Out of memory during batch insert - not retrying (fatal) "
|
|
680
|
+
+ context(file=filename, rg=rg, batch=batch, offset=offset)
|
|
681
|
+
)
|
|
682
|
+
raise
|
|
683
|
+
except Exception as e:
|
|
684
|
+
safe_msg = self.safe_error_message(e)
|
|
685
|
+
|
|
686
|
+
if attempt < self.config.retries - 1:
|
|
687
|
+
logger.warning(
|
|
688
|
+
f"batch insert attempt {attempt + 1}/{self.config.retries} "
|
|
689
|
+
"failed "
|
|
690
|
+
+ context(
|
|
691
|
+
file=filename,
|
|
692
|
+
rg=rg,
|
|
693
|
+
batch=batch,
|
|
694
|
+
rows=len(rows),
|
|
695
|
+
offset=offset,
|
|
696
|
+
)
|
|
697
|
+
+ f": {safe_msg}"
|
|
698
|
+
)
|
|
699
|
+
c.rollback()
|
|
700
|
+
time.sleep(2**attempt)
|
|
701
|
+
else:
|
|
702
|
+
raise BatchImportError(
|
|
703
|
+
f"Batch import failed after {self.config.retries} retries "
|
|
704
|
+
+ context(file=filename, rg=rg, batch=batch, offset=offset)
|
|
705
|
+
+ f": {safe_msg}"
|
|
706
|
+
) from e
|
|
707
|
+
|
|
708
|
+
def _import_rowgroup_with_retry(
    self,
    c,
    cur,
    table_or_batch,
    insert_sql,
    filename,
    rg_idx=None,
    total_rg=None,
    *,
    rg=None,
):
    """Import a single row group with retry logic for ROWGROUP mode.

    ``table_or_batch`` may be a PyArrow Table, RecordBatch, or a
    pre-converted ``list[tuple]`` of rows (from the reader thread).
    Pass *rg* to supply a pre-formatted label; otherwise it is
    derived from *rg_idx* / *total_rg* (and left as ``None`` when
    neither is given, instead of failing on ``None + 1``).

    The row group is inserted in chunks of at most ``self.batch_size`` rows
    and committed once at the end.  A failed attempt is logged, rolled back
    and retried after an exponential back-off (``2**attempt`` seconds).  At
    least one attempt is always made, even when ``config.retries`` is zero
    or negative; previously such a setting skipped the insert entirely and
    returned ``None``.

    Returns:
        The number of rows inserted for the row group.

    Raises:
        MemoryError: Re-raised immediately, never retried.
        RowGroupImportError: When the last attempt fails.
    """
    if rg is None and rg_idx is not None:
        rg = f"{rg_idx + 1}/{total_rg}"
    is_rows = isinstance(table_or_batch, list)

    # Guarantee one attempt so a retries setting of 0 cannot silently skip
    # the insert.
    attempts = max(1, self.config.retries)

    for attempt in range(attempts):
        try:
            # Reset per attempt: a rollback discards earlier chunks.
            total_rows = 0
            if is_rows:
                for i in range(0, len(table_or_batch), self.batch_size):
                    chunk = table_or_batch[i : i + self.batch_size]
                    cur.executemany(insert_sql, chunk)
                    total_rows += len(chunk)
            else:
                # Objects without a callable to_batches are treated as a
                # single batch.
                to_batches = getattr(table_or_batch, "to_batches", None)
                rowgroup_batches = (
                    table_or_batch.to_batches(max_chunksize=self.batch_size)
                    if callable(to_batches)
                    else [table_or_batch]
                )
                for batch in rowgroup_batches:
                    rows = rows_from_arrow(batch)
                    cur.executemany(insert_sql, rows)
                    total_rows += len(rows)

            c.commit()
            logger.debug("Processed row group " + context(file=filename, rg=rg))
            return total_rows
        except MemoryError:
            logger.error(
                "Out of memory during rowgroup insert - not retrying (fatal) "
                + context(file=filename, rg=rg)
            )
            raise
        except Exception as e:
            safe_msg = self.safe_error_message(e)

            if attempt < attempts - 1:
                logger.warning(
                    f"rowgroup insert attempt {attempt + 1}/{self.config.retries} "
                    "failed " + context(file=filename, rg=rg) + f": {safe_msg}"
                )
                # Discard the failed attempt's work before trying again.
                c.rollback()
                time.sleep(2**attempt)
            else:
                raise RowGroupImportError(
                    f"Row group import failed after {self.config.retries} retries "
                    + context(file=filename, rg=rg)
                    + f": {safe_msg}"
                ) from e
|
|
775
|
+
|
|
776
|
+
def _make_temp_table_name(self, worker_index: int) -> str:
|
|
777
|
+
suffix = uuid.uuid4().hex[:8]
|
|
778
|
+
return f"{self.schema}.{self.table}_{worker_index + 1:02d}_{suffix}"
|
|
779
|
+
|
|
780
|
+
def _make_columnstore_index_name(self, temp_table_name: str) -> str:
    """Return the quoted columnstore index name for a staging table.

    Everything after the first ``.`` of *temp_table_name* (or the whole
    name when it has no dot) is prefixed with ``cci_`` and passed through
    ``quote_identifier``.
    """
    head, dot, tail = temp_table_name.partition(".")
    table_part = tail if dot else head
    return quote_identifier(f"cci_{table_part}")
|
|
783
|
+
|
|
784
|
+
def _execute_temp_table_ddl(self, cur, count: int) -> list[str]:
|
|
785
|
+
"""Run the CREATE TABLE / CCI DDL on a cursor, returning table names."""
|
|
786
|
+
temp_tables: list[str] = []
|
|
787
|
+
for i in range(count):
|
|
788
|
+
temp_table_name = self._make_temp_table_name(i)
|
|
789
|
+
cur.execute(
|
|
790
|
+
f"SELECT TOP 0 * INTO {temp_table_name} FROM {self.full_table_name()}"
|
|
791
|
+
)
|
|
792
|
+
if self.create_cci:
|
|
793
|
+
index_name = self._make_columnstore_index_name(temp_table_name)
|
|
794
|
+
cur.execute(
|
|
795
|
+
f"CREATE CLUSTERED COLUMNSTORE INDEX {index_name} "
|
|
796
|
+
f"ON {temp_table_name}"
|
|
797
|
+
)
|
|
798
|
+
temp_tables.append(temp_table_name)
|
|
799
|
+
return temp_tables
|
|
800
|
+
|
|
801
|
+
def _create_temp_tables(self, count: int) -> list[str]:
|
|
802
|
+
if self.engine == "mssql-python":
|
|
803
|
+
conn = self.connection_m(autocommit=True)
|
|
804
|
+
try:
|
|
805
|
+
cur = conn.cursor()
|
|
806
|
+
try:
|
|
807
|
+
return self._execute_temp_table_ddl(cur, count)
|
|
808
|
+
finally:
|
|
809
|
+
cur.close()
|
|
810
|
+
finally:
|
|
811
|
+
conn.close()
|
|
812
|
+
else:
|
|
813
|
+
with self.connection_p(autocommit=True) as conn:
|
|
814
|
+
with conn.cursor() as cur:
|
|
815
|
+
return self._execute_temp_table_ddl(cur, count)
|
|
816
|
+
|
|
817
|
+
def _assign_files_to_workers(
|
|
818
|
+
self, filenames: list[str], temp_tables: list[str]
|
|
819
|
+
) -> dict[str, list[str]]:
|
|
820
|
+
assignments: dict[str, list[str]] = {tbl: [] for tbl in temp_tables}
|
|
821
|
+
for index, filename in enumerate(filenames):
|
|
822
|
+
target_table = temp_tables[index % len(temp_tables)]
|
|
823
|
+
assignments[target_table].append(filename)
|
|
824
|
+
return assignments
|
|
825
|
+
|
|
826
|
+
def _write_temp_manifest(self, temp_tables: list[str]) -> Path:
    """Write the staging-table manifest and return its path.

    The manifest is written to ``input_path / temp_manifest_filename`` with
    ``manifest_type="tables"``.
    """
    destination = self.input_path / self.temp_manifest_filename
    return write_manifest(destination, temp_tables, manifest_type="tables")
|
|
832
|
+
|
|
833
|
+
def _import_files_to_temp_table(self, target_table: str, filenames: list[str]):
|
|
834
|
+
for filename in filenames:
|
|
835
|
+
self.import_file(filename, target_table=target_table)
|
|
836
|
+
|
|
837
|
+
def _delete_original_files(self, filenames: list[str]):
|
|
838
|
+
for filename in filenames:
|
|
839
|
+
path = self.input_path / filename
|
|
840
|
+
if path.exists():
|
|
841
|
+
path.unlink()
|
|
842
|
+
|
|
843
|
+
manifest_path = self.input_path / self.manifest_filename
|
|
844
|
+
if manifest_path.exists():
|
|
845
|
+
manifest_path.unlink()
|
|
846
|
+
|
|
847
|
+
def perform_work(self):
    """Run the import while a memory heartbeat is active."""
    # Worker threads share this single process's memory, so one heartbeat
    # around the whole run reports the RSS trend for every worker.
    heartbeat = MemoryHeartbeat(self.mem_heartbeat, unit="import")
    with heartbeat:
        self._perform_work()
|
|
852
|
+
|
|
853
|
+
def _perform_work(self):
    """Import every file listed in the manifest.

    With more than one worker and more than one file, each worker loads
    its share of files into its own staging table and a manifest of those
    tables is written afterwards.  Otherwise the files are imported
    straight into the target table through the thread pool.

    Raises:
        Exception: The first worker failure, re-raised by
            ``_await_futures`` after all workers have finished.
    """
    filenames = self.load_manifest_entries()

    log_memory_budget(
        operation="import",
        workers=self.worker_count,
        threshold_pct=self.mem_gate.threshold_pct,
    )

    use_staging_tables = self.worker_count > 1 and len(filenames) > 1

    if not use_staging_tables:
        # Direct path: one task per file, all writing to the target table.
        with ThreadPoolExecutor(
            max_workers=self.worker_count, thread_name_prefix="import"
        ) as pool:
            pending = {}
            for filename in filenames:
                pending[pool.submit(self.import_file, filename)] = filename

            self._await_futures(pending, label="file")
        return

    # Staging path: never create more tables than there are files.
    staging_count = min(self.worker_count, len(filenames))
    temp_tables = self._create_temp_tables(staging_count)
    assignments = self._assign_files_to_workers(filenames, temp_tables)

    with ThreadPoolExecutor(
        max_workers=staging_count, thread_name_prefix="import"
    ) as pool:
        pending = {}
        for target_table, assigned in assignments.items():
            if assigned:
                task = pool.submit(
                    self._import_files_to_temp_table, target_table, assigned
                )
                pending[task] = target_table

        self._await_futures(pending, label="table")

    # Reached only when every worker succeeded.
    manifest_file = self._write_temp_manifest(temp_tables)
    logger.info("Wrote temporary table manifest " + context(file=manifest_file))
|
|
893
|
+
|
|
894
|
+
def _await_futures(self, futures, label):
|
|
895
|
+
"""Wait for worker futures, surfacing *all* failures before re-raising.
|
|
896
|
+
|
|
897
|
+
Without this, a worker exception only re-raises as a bare traceback with
|
|
898
|
+
no indication of *which* file/table the dead worker was handling. We now
|
|
899
|
+
wait for every future so errors from all workers are logged, then raise
|
|
900
|
+
the first failure.
|
|
901
|
+
"""
|
|
902
|
+
first_error: Exception | None = None
|
|
903
|
+
completed_units: list[str] = []
|
|
904
|
+
for future in as_completed(futures):
|
|
905
|
+
unit = futures[future]
|
|
906
|
+
try:
|
|
907
|
+
future.result()
|
|
908
|
+
completed_units.append(str(unit))
|
|
909
|
+
except Exception as e:
|
|
910
|
+
logger.error(
|
|
911
|
+
"Worker failed "
|
|
912
|
+
+ context(**{label: unit})
|
|
913
|
+
+ f": {self.safe_error_message(e)}"
|
|
914
|
+
)
|
|
915
|
+
if first_error is None:
|
|
916
|
+
first_error = e
|
|
917
|
+
if first_error is not None:
|
|
918
|
+
log_failure_summary(
|
|
919
|
+
operation="import",
|
|
920
|
+
workers=len(futures),
|
|
921
|
+
completed=completed_units,
|
|
922
|
+
failed_error=self.safe_error_message(first_error),
|
|
923
|
+
)
|
|
924
|
+
raise first_error
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
# Running this module directly does nothing; the guard body is a no-op.
if __name__ == "__main__":
    pass
|