pybutt 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pybutt/io/exporter.py ADDED
@@ -0,0 +1,612 @@
1
+ import math as m
2
+ import time
3
+ from multiprocessing import get_context
4
+ from pathlib import Path
5
+
6
+ import pyarrow as pa
7
+ import pyarrow.parquet as pq
8
+ import pyodbc
9
+
10
+ from pybutt.core.base import SqlServerIOBase
11
+ from pybutt.core.config import (
12
+ EXPORT_ENGINE_DEFAULT,
13
+ FETCH_SIZE_DEFAULT,
14
+ MEM_COOLDOWN_DEFAULT,
15
+ MEM_HEARTBEAT_DEFAULT,
16
+ MEM_MAX_WAIT_DEFAULT,
17
+ MEM_SLEEP_DEFAULT,
18
+ MEM_THRESHOLD_DEFAULT,
19
+ ROWGROUP_SIZE_DEFAULT,
20
+ SCHEMA_DEFAULT,
21
+ SqlConfig,
22
+ quote_identifier,
23
+ validate_engine,
24
+ validate_identifier,
25
+ validate_parameters,
26
+ )
27
+ from pybutt.core.logobs import (
28
+ MemoryGate,
29
+ MemoryHeartbeat,
30
+ WorkerMonitor,
31
+ context,
32
+ get_logger,
33
+ init_worker_logging,
34
+ log_failure_summary,
35
+ log_memory_budget,
36
+ mem_fields,
37
+ )
38
+ from pybutt.exceptions import (
39
+ ConfigurationError,
40
+ DataExportError,
41
+ TableEmptyError,
42
+ )
43
+ from pybutt.files import (
44
+ default_manifest_filename,
45
+ write_manifest,
46
+ )
47
+
48
+ logger = get_logger("exporter")
49
+
50
+
51
+ class Exporter(SqlServerIOBase):
52
def __init__(
    self,
    config: SqlConfig,
    table: str,
    output_path,
    schema: str = SCHEMA_DEFAULT,
    pk_column=None,
    columns=None,
    worker_count=1,
    file_count=1,
    rowgroup_size=ROWGROUP_SIZE_DEFAULT,
    fetch_size=FETCH_SIZE_DEFAULT,
    engine=EXPORT_ENGINE_DEFAULT,
    manifest_filename: str | None = None,
    parameters: str | None = None,
    mem_heartbeat: float = MEM_HEARTBEAT_DEFAULT,
    mem_threshold: float = MEM_THRESHOLD_DEFAULT,
    mem_sleep: float = MEM_SLEEP_DEFAULT,
    mem_max_wait: float = MEM_MAX_WAIT_DEFAULT,
    mem_cooldown: float = MEM_COOLDOWN_DEFAULT,
):
    """Validate the export settings and compute the partition layout.

    Args:
        config: Connection settings forwarded to the base class.
        table: Source object name; passed through ``validate_identifier``.
        output_path: Directory for the Parquet files and manifest; created
            (with parents) if it does not exist.
        schema: Source schema name; passed through ``validate_identifier``.
        pk_column: Optional ordering column. When set, partitions are
            ROW_NUMBER ranges; otherwise CHECKSUM buckets are used.
        columns: Optional list of column names to export.
        worker_count: Size of the worker pool used by ``perform_work``.
            ``None`` is passed through to the pool unchanged.
        file_count: Number of partitions / output files.
        rowgroup_size: Rows per Parquet row group.
        fetch_size: Rows per ``fetchmany`` call for cursor engines; ``None``
            is accepted.
        engine: Export engine name; checked with ``validate_engine``.
        manifest_filename: Manifest file name; defaults to
            ``default_manifest_filename(schema, table)``.
        parameters: Optional argument list appended to the source reference
            as ``name(parameters)``; passed through ``validate_parameters``.
        mem_heartbeat: Interval handed to the memory heartbeat / monitor.
        mem_threshold, mem_sleep, mem_max_wait, mem_cooldown: Settings
            forwarded to ``MemoryGate``.

    Raises:
        ConfigurationError: If ``file_count``, ``fetch_size``,
            ``worker_count`` or ``rowgroup_size`` is below 1.
        TableEmptyError: Raised by ``partition_meta`` when no rows are found.
    """
    super().__init__(config)
    self.schema = validate_identifier(schema)
    self.table = validate_identifier(table)

    self.mem_heartbeat = mem_heartbeat
    self.mem_gate = MemoryGate(mem_threshold, mem_sleep, mem_max_wait, mem_cooldown)

    self.pk_column = validate_identifier(pk_column) if pk_column else None
    self.columns = [validate_identifier(c) for c in columns] if columns else None
    self.parameters = validate_parameters(parameters) if parameters else None

    validate_engine(engine)

    if file_count < 1:
        raise ConfigurationError("file_count must be at least 1")

    if fetch_size is not None and fetch_size < 1:
        raise ConfigurationError("fetch_size must be at least 1")

    # A pool cannot be created with fewer than one process; fail here with a
    # configuration error instead of a late ValueError from multiprocessing.
    if worker_count is not None and worker_count < 1:
        raise ConfigurationError("worker_count must be at least 1")

    # The row-group flush loops compare the buffered row count against
    # rowgroup_size; a value below 1 would make them spin forever.
    if rowgroup_size is None or rowgroup_size < 1:
        raise ConfigurationError("rowgroup_size must be at least 1")

    self.worker_count = worker_count
    self.file_count = file_count
    self.rowgroup_size = rowgroup_size
    self.engine = engine
    self.fetch_size = fetch_size

    self.output_path = Path(output_path)
    self.output_path.mkdir(parents=True, exist_ok=True)
    self.manifest_filename = (
        manifest_filename
        if manifest_filename
        else default_manifest_filename(self.schema, self.table)
    )

    # Filled in by partition_meta() below.
    self.total_rows = 0
    self.partition_count = 0
    self.chunk_size = 0

    self.partition_meta()
111
+
112
def partition_meta(self):
    """Count the source rows and derive the partition layout.

    The row count is read from ``sys.dm_db_partition_stats`` first; when
    that yields nothing, a ``COUNT(*)`` over the source reference is used.
    Sets ``total_rows``, ``partition_count`` (equal to ``file_count``) and
    ``chunk_size`` (rows per partition, rounded up).

    Raises:
        TableEmptyError: If the resulting row count is zero.
    """

    def _first_value(conn, sql):
        # Run ``sql`` through odbc_query and return the first column of the
        # first row, turning a falsy result (e.g. NULL) into 0.
        return (
            conn.execute(f"FROM odbc_query('{self.dsn}', $$ {sql} $$)").fetchone()[0]
            or 0
        )

    def _count_rows():
        with self.connection_d() as conn:
            stats_sql = f"""
                SELECT SUM(row_count)
                FROM sys.dm_db_partition_stats
                WHERE object_id = OBJECT_ID('{self.full_table_name()}')
                AND index_id IN (0,1)
            """
            counted = _first_value(conn, stats_sql)
            if counted != 0:
                return counted

            logger.info("Partition stats returned zero rows; falling back to COUNT(*)")
            return _first_value(
                conn, f"SELECT COUNT(*) FROM {self._source_reference()}"
            )

    self.total_rows = self.retry(_count_rows, context="Fetching partition strategy")
    if self.total_rows == 0:
        raise TableEmptyError("Table empty or not found")

    # One partition per output file.
    self.partition_count = self.file_count
    self.chunk_size = m.ceil(self.total_rows / self.partition_count)

    layout = context(
        table=f"{self.schema}.{self.table}",
        total_rows=self.total_rows,
        file_count=self.file_count,
        chunk_size=self.chunk_size,
    )
    logger.info("Partitioning " + layout)

    if not self.pk_column:
        logger.info(
            "Partition strategy=CHECKSUM " + context(modulo=self.partition_count)
        )
    else:
        logger.info("Partition strategy=ROW_NUMBER " + context(pk=self.pk_column))
167
+
168
def get_table_columns(self):
    """Return the column names of the source table in ordinal order.

    The names are read from ``INFORMATION_SCHEMA.COLUMNS`` for the
    configured schema and table via ``odbc_query``.
    """
    metadata_sql = f"""
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{self.schema}'
        AND TABLE_NAME = '{self.table}'
        ORDER BY ORDINAL_POSITION
    """

    with self.connection_d() as conn:
        wrapped_sql = f"FROM odbc_query('{self.dsn}', $$ {metadata_sql} $$)"
        records = conn.execute(wrapped_sql).fetchall()

    # Each record is a one-column row holding the column name.
    names = []
    for record in records:
        names.append(record[0])
    return names
181
+
182
+ def _source_reference(self) -> str:
183
+ if self.parameters is None:
184
+ return self.full_table_name()
185
+ return f"{self.full_table_name()}({self.parameters})"
186
+
187
def build_partition_query(self, n):
    """Build the SELECT statement that yields partition ``n``.

    With ``pk_column`` set, rows are numbered by ``ROW_NUMBER()`` ordered by
    that column and partition ``n`` takes rows
    ``n * chunk_size < rn <= (n + 1) * chunk_size``. The last partition has
    no upper bound: ``chunk_size`` comes from a row count taken earlier, so
    any rows beyond ``partition_count * chunk_size`` would otherwise be
    silently left out of every file.

    Without ``pk_column``, rows are bucketed by
    ``ABS(CHECKSUM(*)) % partition_count``. The checksum is widened to
    BIGINT first because ``ABS`` of the smallest INT value overflows in
    SQL Server.

    Args:
        n: Zero-based partition index.

    Returns:
        The SQL text for the partition.
    """
    if self.pk_column:
        if self.columns is None:
            column_names = self.get_table_columns()
        else:
            column_names = self.columns
        selected_columns = ", ".join(quote_identifier(c) for c in column_names)

        start = n * self.chunk_size
        row_filter = f"rn > {start}"
        if n < self.partition_count - 1:
            # Only non-final partitions are capped; see the docstring.
            end = (n + 1) * self.chunk_size
            row_filter += f" AND rn <= {end}"

        return (
            f"SELECT {selected_columns} "
            "FROM ( "
            f"SELECT {selected_columns}, "
            "ROW_NUMBER() OVER ("
            f"ORDER BY {quote_identifier(self.pk_column)}"
            ") AS rn "
            f"FROM {self._source_reference()} "
            ") t "
            f"WHERE {row_filter}"
        )

    selected_columns = (
        ", ".join(quote_identifier(c) for c in self.columns)
        if self.columns is not None
        else "*"
    )
    return f"""
        SELECT {selected_columns}
        FROM {self._source_reference()}
        WHERE ABS(CAST(CHECKSUM(*) AS BIGINT)) % {self.partition_count} = {n}
    """
220
+
221
def _pyodbc_type_code_to_pyarrow(self, type_code, precision, scale, internal_size):
    """Map an ODBC SQL type constant to a pyarrow data type.

    Args:
        type_code: One of the ``pyodbc.SQL_*`` type constants.
        precision: Decimal precision; 38 is used when falsy.
        scale: Decimal scale; 0 is used when falsy.
        internal_size: Accepted for signature compatibility; not used.

    Returns:
        The matching pyarrow type; ``pa.string()`` for anything unmapped.
    """
    # NOTE(review): pyodbc's cursor.description appears to report Python
    # types rather than SQL_* constants in the type_code slot - confirm that
    # callers pass SQL_* constants, otherwise every column falls through to
    # the string fallback.
    if type_code in (pyodbc.SQL_TINYINT, pyodbc.SQL_SMALLINT, pyodbc.SQL_INTEGER):
        return pa.int32()
    if type_code == pyodbc.SQL_BIGINT:
        return pa.int64()
    if type_code == pyodbc.SQL_REAL:
        return pa.float32()
    # SQL_FLOAT is a double-precision type in ODBC (like SQL_DOUBLE); mapping
    # it to float32 would truncate the values.
    if type_code in (pyodbc.SQL_FLOAT, pyodbc.SQL_DOUBLE):
        return pa.float64()
    if type_code in (pyodbc.SQL_DECIMAL, pyodbc.SQL_NUMERIC):
        precision = precision or 38
        scale = scale or 0
        return pa.decimal128(precision, scale)
    if type_code in (
        pyodbc.SQL_CHAR,
        pyodbc.SQL_VARCHAR,
        pyodbc.SQL_LONGVARCHAR,
        pyodbc.SQL_WCHAR,
        pyodbc.SQL_WVARCHAR,
        pyodbc.SQL_WLONGVARCHAR,
    ):
        return pa.string()
    if type_code in (
        pyodbc.SQL_BINARY,
        pyodbc.SQL_VARBINARY,
        pyodbc.SQL_LONGVARBINARY,
    ):
        return pa.binary()
    if type_code == pyodbc.SQL_BIT:
        return pa.bool_()
    if type_code == pyodbc.SQL_TYPE_DATE:
        return pa.date32()
    if type_code == pyodbc.SQL_TYPE_TIME:
        return pa.time64("us")
    if type_code == pyodbc.SQL_TYPE_TIMESTAMP:
        return pa.timestamp("us")
    # Unknown types are exported as text.
    return pa.string()
258
+
259
def _pyodbc_schema_from_description(self, description):
    """Build a pyarrow schema from a DB-API ``cursor.description``.

    Each description entry is the DB-API 7-item sequence
    ``(name, type_code, display_size, internal_size, precision, scale,
    null_ok)``. Shorter entries are tolerated; missing items are treated as
    unknown.

    Args:
        description: Iterable of column description sequences.

    Returns:
        A ``pa.Schema`` with one field per described column.
    """
    fields = []
    for column in description:
        width = len(column)
        internal_size = column[3] if width > 3 else None
        # Precision, scale and null_ok live at indices 4, 5 and 6.
        precision = column[4] if width > 4 else None
        scale = column[5] if width > 5 else None
        null_ok = column[6] if width > 6 else None

        arrow_type = self._pyodbc_type_code_to_pyarrow(
            column[1], precision, scale, internal_size
        )
        # Unknown nullability (None) is treated as nullable so that NULLs in
        # the data are never rejected by the schema.
        nullable = True if null_ok is None else bool(null_ok)
        fields.append(pa.field(column[0], arrow_type, nullable=nullable))
    return pa.schema(fields)
280
+
281
def _write_parquet_from_record_batches(self, reader, filepath, filename):
    """Stream record batches from ``reader`` into a snappy Parquet file.

    Batches are accumulated until at least ``rowgroup_size`` rows are
    buffered, then written out one full row group at a time. Whatever is
    left at the end is written as a final, smaller row group; if the reader
    produced no batches at all, an empty table with the reader's schema is
    written.

    Raises:
        DataExportError: Wrapping any exception raised while reading or
            writing.
    """
    try:
        schema = reader.schema
        flushed = 0
        target = str(filepath.as_posix())
        with pq.ParquetWriter(target, schema, compression="snappy") as sink:
            pending = None
            for batch in reader:
                # Let the memory gate decide whether to pause before buffering more.
                self.mem_gate.check(f"record_batch file={filename}")
                incoming = pa.Table.from_batches([batch])
                pending = (
                    incoming
                    if pending is None
                    else pa.concat_tables([pending, incoming])
                )

                # Drain every complete row group currently buffered.
                while pending.num_rows >= self.rowgroup_size:
                    sink.write_table(
                        pending.slice(0, self.rowgroup_size),
                        row_group_size=self.rowgroup_size,
                    )
                    pending = pending.slice(self.rowgroup_size)
                    flushed += 1
                    logger.debug(
                        "Flushed rowgroup "
                        + context(
                            file=filename,
                            rg=flushed,
                            buffered=(pending.num_rows if pending else 0),
                            **mem_fields(),
                        )
                    )

            if pending is None:
                # No batches at all: still emit a valid, empty file.
                sink.write_table(pa.Table.from_batches([], schema=schema))
            elif pending.num_rows > 0:
                sink.write_table(pending, row_group_size=self.rowgroup_size)
    except Exception as exc:
        raise DataExportError(
            f"Failed exporting {filename}: {self.safe_error_message(exc)}"
        ) from exc
327
+
328
+ def _export_partition_with_duckdb(self, query, filepath, filename):
329
+ with self.connection_d() as c:
330
+ try:
331
+ result = c.execute(f"FROM odbc_query('{self.dsn}', $$ {query} $$)")
332
+ reader = result.arrow()
333
+ self._write_parquet_from_record_batches(reader, filepath, filename)
334
+ except DataExportError:
335
+ raise
336
+ except Exception as e:
337
+ raise DataExportError(
338
+ f"Failed exporting {filename}: {self.safe_error_message(e)}"
339
+ ) from e
340
+
341
def _export_cursor_to_parquet(self, cur, filepath, filename):
    """Shared fetch-buffer-write logic for cursor-based engines (pyodbc / mssql).

    Rows are fetched in batches, buffered, and written to a snappy Parquet
    file one row group (``rowgroup_size`` rows) at a time. The Parquet
    schema is inferred from the first fetched batch; later batches are
    coerced to that schema when their inferred schema differs. An empty
    result produces a file with all-string columns and no rows.

    Args:
        cur: An executed DB-API cursor.
        filepath: Destination path of the Parquet file.
        filename: File name used in log and error messages.

    Raises:
        DataExportError: If the cursor exposes no column metadata.
    """
    if cur.description is None:
        raise DataExportError(
            f"Failed exporting {filename}: query returned no column metadata"
        )

    columns = [desc[0] for desc in cur.description]
    fetch_size = self.fetch_size

    def _fetch_batch():
        # The constructor accepts fetch_size=None; in that case defer to the
        # driver's own default batch size instead of passing None through,
        # which not every driver accepts.
        if fetch_size is None:
            return cur.fetchmany()
        return cur.fetchmany(fetch_size)

    def _as_records(rows):
        return [dict(zip(columns, row, strict=True)) for row in rows]

    first_rows = _fetch_batch()
    if not first_rows:
        # Nothing to infer types from: emit an empty, all-string file.
        empty_schema = pa.schema([pa.field(c, pa.string()) for c in columns])
        with pq.ParquetWriter(
            str(filepath.as_posix()), empty_schema, compression="snappy"
        ) as writer:
            writer.write_table(
                pa.Table.from_pydict({c: [] for c in columns}, schema=empty_schema)
            )
        return

    # NOTE(review): the schema is inferred from the first batch only; a
    # column that is entirely NULL there may get a type that later non-NULL
    # values cannot be coerced to - confirm whether that matters for callers.
    target_schema = pa.Table.from_pylist(_as_records(first_rows)).schema

    def _rows_to_table(rows_to_write):
        batch = _as_records(rows_to_write)
        tbl = pa.Table.from_pylist(batch)
        if tbl.schema != target_schema:
            # Re-build column by column so every row group matches the
            # schema the writer was opened with.
            arrays = [
                pa.array([r.get(field.name) for r in batch], type=field.type)
                for field in target_schema
            ]
            tbl = pa.Table.from_arrays(
                arrays, names=[f.name for f in target_schema]
            )
        return tbl

    with pq.ParquetWriter(
        str(filepath.as_posix()), target_schema, compression="snappy"
    ) as writer:
        buffered_rows = list(first_rows)
        rg_written = 0
        total_fetched = len(first_rows)

        while True:
            # Flush complete row groups before fetching more rows.
            if len(buffered_rows) >= self.rowgroup_size:
                rows_to_write = buffered_rows[: self.rowgroup_size]
                writer.write_table(
                    _rows_to_table(rows_to_write),
                    row_group_size=self.rowgroup_size,
                )
                buffered_rows = buffered_rows[self.rowgroup_size :]
                rg_written += 1
                logger.debug(
                    "Flushed rowgroup "
                    + context(
                        file=filename,
                        rg=rg_written,
                        buffered=len(buffered_rows),
                        fetched=total_fetched,
                        **mem_fields(),
                    )
                )
                continue

            self.mem_gate.check(f"fetchmany file={filename}")
            rows = _fetch_batch()
            if not rows:
                break
            buffered_rows.extend(rows)
            total_fetched += len(rows)
            logger.debug(
                "Fetched batch "
                + context(
                    file=filename,
                    rows=len(rows),
                    buffered=len(buffered_rows),
                    total_fetched=total_fetched,
                    **mem_fields(),
                )
            )

        # Remainder smaller than one row group.
        if buffered_rows:
            writer.write_table(
                _rows_to_table(buffered_rows),
                row_group_size=self.rowgroup_size,
            )
430
+
431
+ def _export_partition_with_pyodbc(self, query, filepath, filename):
432
+ with self.connection_p() as conn:
433
+ with conn.cursor() as cur:
434
+ try:
435
+ cur.execute(query)
436
+ self._export_cursor_to_parquet(cur, filepath, filename)
437
+ except DataExportError:
438
+ raise
439
+ except Exception as e:
440
+ raise DataExportError(
441
+ f"Failed exporting {filename}: {self.safe_error_message(e)}"
442
+ ) from e
443
+
444
+ def _export_partition_with_mssql(self, query, filepath, filename):
445
+ conn = self.connection_m()
446
+ try:
447
+ cur = conn.cursor()
448
+ try:
449
+ cur.execute(query)
450
+ self._export_cursor_to_parquet(cur, filepath, filename)
451
+ except DataExportError:
452
+ raise
453
+ except Exception as e:
454
+ raise DataExportError(
455
+ f"Failed exporting {filename}: {self.safe_error_message(e)}"
456
+ ) from e
457
+ finally:
458
+ cur.close()
459
+ finally:
460
+ conn.close()
461
+
462
def export_partition(self, n):
    """Export partition ``n`` to its own Parquet file and return the file name.

    The file is named ``<schema>_<table>_part_<n, 5 digits>.parquet`` inside
    ``output_path``. The engine-specific export runs under ``self.retry``
    with a memory heartbeat active; failures are logged and re-raised.

    Args:
        n: Zero-based partition index.

    Returns:
        The name (not the full path) of the written file.
    """
    started = time.time()
    filename = f"{self.schema}_{self.table}_part_{n:05d}.parquet"
    filepath = self.output_path / filename
    query = self.build_partition_query(n)

    logger.debug("Partition query " + context(partition=n) + f": {query}")
    logger.info(
        "Exporting "
        + context(
            file=filename,
            partition=f"{n}/{self.partition_count - 1}",
            table=f"{self.schema}.{self.table}",
            engine=self.engine,
            **mem_fields(),
        )
    )

    def _run_engine():
        # Anything other than the two named engines uses the pyodbc path.
        exporters = {
            "duckdb": self._export_partition_with_duckdb,
            "mssql-python": self._export_partition_with_mssql,
        }
        export = exporters.get(self.engine, self._export_partition_with_pyodbc)
        export(query, filepath, filename)

    try:
        # Heartbeat runs inside the worker process where the memory lives.
        with MemoryHeartbeat(self.mem_heartbeat, unit=f"partition={n}"):
            self.retry(_run_engine, context=f"Export partition {n}")
    except MemoryError:
        logger.error(
            "Out of memory during export - not retrying (fatal) "
            + context(partition=n, file=filename)
        )
        raise
    except Exception as exc:
        logger.error(
            "Export partition failed "
            + context(partition=n, file=filename)
            + f": {self.safe_error_message(exc)}"
        )
        logger.debug("Traceback for partition %s", n, exc_info=True)
        raise

    elapsed = time.time() - started
    size_mb = filepath.stat().st_size / (1024 * 1024) if filepath.exists() else 0
    logger.info(
        "Completed "
        + context(
            file=filename,
            rows_approx=self.chunk_size,
            size_mb=f"{size_mb:.2f}",
            seconds=f"{elapsed:.2f}",
            progress=f"{n + 1}/{self.partition_count}",
            **mem_fields(),
        )
    )

    return filename
526
+
527
+ def _get_pool_worker_pids(self, pool) -> list[int]:
528
+ """Extract worker PIDs from a multiprocessing Pool."""
529
+ try:
530
+ return [w.pid for w in pool._pool if w.pid is not None]
531
+ except Exception:
532
+ return []
533
+
534
def perform_work(self):
    """Export every partition in a worker pool, then write the manifest.

    Partitions ``0 .. partition_count - 1`` are mapped over
    ``export_partition`` in a spawn-context pool of ``worker_count``
    processes. The returned file names are written to the manifest file in
    ``output_path``.

    Raises:
        DataExportError: If writing the manifest fails.
        Exception: Any pool/worker failure is logged and re-raised as is.
    """
    started = time.time()
    manifest_file = self.output_path / self.manifest_filename

    log_memory_budget(
        operation="export",
        workers=self.worker_count,
        total_rows=getattr(self, "total_rows", None),
        threshold_pct=self.mem_gate.threshold_pct,
    )

    # Spawned worker processes re-import modules and do NOT inherit the
    # parent's logging config, so configure it in each via the initialiser
    # (spawn is the default on Windows/macOS and is forced here on all OSes).
    worker_level = get_logger().getEffectiveLevel()
    try:
        spawn_ctx = get_context("spawn")
        with spawn_ctx.Pool(
            self.worker_count,
            initializer=init_worker_logging,
            initargs=(worker_level,),
        ) as pool:
            # Use map_async so we can extract PIDs and start the monitor
            # before blocking on results.
            pending = pool.map_async(
                self.export_partition, range(self.partition_count)
            )
            pids = self._get_pool_worker_pids(pool)
            if pids:
                pid_list = ",".join(str(pid) for pid in pids)
                logger.info(
                    "Worker pool started " + context(workers=len(pids), pids=pid_list)
                )
            with WorkerMonitor(pids, self.mem_heartbeat):
                filenames = pending.get()
    except Exception as exc:
        # A worker killed abruptly (e.g. OOM/SIGKILL) surfaces here without a
        # partition context; make the likely cause explicit.
        logger.error(
            "Export pool failed - a worker may have terminated abnormally "
            "(possible out-of-memory/SIGKILL); check earlier per-partition "
            f"logs: {self.safe_error_message(exc)}"
        )
        log_failure_summary(
            operation="export",
            workers=self.worker_count,
            failed_error=self.safe_error_message(exc),
        )
        raise

    elapsed = time.time() - started
    logger.info(
        "Export complete "
        + context(
            table=f"{self.schema}.{self.table}",
            files=len(filenames),
            seconds=f"{elapsed:.2f}",
        )
    )

    logger.info("Writing manifest " + context(file=manifest_file))
    try:
        write_manifest(manifest_file, filenames)
        logger.info(
            "Manifest written " + context(file=manifest_file, files=len(filenames))
        )
    except Exception as exc:
        logger.error(
            "Failed to write manifest "
            + context(file=manifest_file)
            + f": {self.safe_error_message(exc)}"
        )
        raise DataExportError(
            f"Failed to write manifest {manifest_file}: "
            f"{self.safe_error_message(exc)}"
        ) from exc
609
+
610
+
611
# Running this module directly does nothing; the guard body is a placeholder.
if __name__ == "__main__":
    pass