pybutt 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pybutt/core/config.py ADDED
@@ -0,0 +1,144 @@
1
import re
from dataclasses import dataclass, field
from enum import StrEnum

from pybutt.exceptions import (
    EngineSelectionError,
    InvalidIdentifierError,
    InvalidParameterError,
)
10
+
11
# Engine names accepted by validate_engine() when the caller does not supply
# its own allow-list. Frozen so the shared default cannot be mutated.
ENGINE_CHOICES = frozenset({"duckdb", "pyodbc", "mssql-python"})
12
+
13
+
14
+ class TransactionMode(StrEnum):
15
+ """Control how transactions are handled during import."""
16
+
17
+ BATCH = "batch" # Each batch of batch_size rows in its own transaction
18
+ ROWGROUP = "rowgroup" # Each row group in the parquet file in its own transaction
19
+ FILE = "file" # Entire file in one transaction
20
+
21
+
22
# Global defaults
DRIVER_DEFAULT = "ODBC Driver 18 for SQL Server"
SCHEMA_DEFAULT = "dbo"
TRUSTED_CONNECTION_DEFAULT = False
TRUST_CERT_DEFAULT = False
ENCRYPT_DEFAULT = True
RETRIES_DEFAULT = 3

# Default memory heartbeat interval in seconds. Set to 30 so operators always
# have a recent RSS breadcrumb trail when a worker is OOM-killed.
MEM_HEARTBEAT_DEFAULT: float = 30.0

# Default memory-pressure throttle threshold (% of system memory in use). When
# system memory exceeds this percentage, workers sleep until pressure drops.
# Set to 85% so an OOM kill is avoided without throttling during normal
# operation.
MEM_THRESHOLD_DEFAULT: float = 85.0

# Seconds to sleep per throttle cycle, and the maximum total wait (seconds)
# before giving up.
MEM_SLEEP_DEFAULT: float = 5.0
MEM_MAX_WAIT_DEFAULT: float = 300.0

# Cooldown in seconds after a throttle event before the gate re-checks.
# Prevents the gate from firing on every loop iteration and serialising workers.
MEM_COOLDOWN_DEFAULT: float = 30.0

# Default TDS packet size in bytes. The valid range for all drivers is
# 512–32767, and SQL Server caps encrypted connections at 16383. The default
# below (4096) sits well inside both limits.
PACKET_SIZE_DEFAULT: int = 4_096

# Import specific defaults
IMPORT_ENGINE_DEFAULT = "mssql-python"
BATCH_SIZE_DEFAULT = 1_000
TRANSACTION_MODE_DEFAULT = TransactionMode.ROWGROUP
# NOTE(review): "CCI" presumably means clustered columnstore index — confirm
# against the importer that consumes this flag.
CCI_DEFAULT = True

# Export specific defaults
EXPORT_ENGINE_DEFAULT = "pyodbc"
FETCH_SIZE_DEFAULT = 1_000
# 1_048_576 == 2**20; presumably a row count per Parquet row group — confirm
# against the exporter.
ROWGROUP_SIZE_DEFAULT = 1_048_576
62
+
63
+
64
+ def validate_engine(engine: str, allowed: frozenset[str] | None = None) -> str:
65
+ """Raise :class:`EngineSelectionError` if *engine* is not in *allowed*."""
66
+ choices = allowed if allowed is not None else ENGINE_CHOICES
67
+ if engine not in choices:
68
+ raise EngineSelectionError(f"engine must be one of {sorted(choices)}")
69
+ return engine
70
+
71
+
72
+ def coerce_transaction_mode(mode: TransactionMode | str) -> TransactionMode:
73
+ """Accept a :class:`TransactionMode` or its string value and return the enum."""
74
+ if isinstance(mode, str):
75
+ return TransactionMode(mode)
76
+ return mode
77
+
78
+
79
# Plain (unquoted) identifier: an ASCII letter or underscore followed by ASCII
# letters, digits or underscores.
# Anchored with ``\Z`` rather than ``$`` because ``$`` also matches just before
# a trailing newline, which would let ``"name\n"`` through ``.match()``.
IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*\Z")
80
+
81
# TVF parameters must be a comma-separated list of literals:
# integers/decimals, single-quoted strings (no nested quotes), NULLs.
# re.ASCII restricts ``\d`` and ``\s`` to ASCII; without it, Python's Unicode
# classes would accept non-ASCII digits and whitespace as part of a "numeric
# literal".
_PARAM_TOKEN_RE = re.compile(
    r"\s*(?:NULL|[+-]?\d+(?:\.\d+)?|'[^']*')\s*",
    re.IGNORECASE | re.ASCII,
)


def validate_parameters(params: str) -> str:
    """Reject parameter strings that could contain SQL injection payloads.

    Accepts only comma-separated SQL literals: numbers (optional sign,
    optional decimal part), single-quoted strings with no embedded quotes,
    and ``NULL`` (case-insensitive). Whitespace around each token is allowed.

    The string is split on every comma before matching, so a quoted string
    that itself contains a comma is rejected, and so is an empty string or an
    empty token (``"1,,2"``).

    Returns *params* unchanged when every token is acceptable.

    Raises :class:`InvalidParameterError` naming the first offending token.
    """
    for token in params.split(","):
        if _PARAM_TOKEN_RE.fullmatch(token) is None:
            raise InvalidParameterError(
                f"Unsafe TVF parameter token: {token.strip()!r}. "
                "Only numeric literals, single-quoted strings, and NULL are allowed."
            )
    return params
103
+
104
+
105
def validate_identifier(name: str) -> str:
    """Return *name* unchanged if it is a plain identifier.

    The whole string must satisfy :data:`IDENTIFIER_REGEX`. ``fullmatch`` is
    used instead of ``match`` so the check does not depend on how the pattern
    is anchored: with a ``$`` anchor, ``match`` would accept a name that ends
    in a newline.

    Raises :class:`InvalidIdentifierError` when *name* does not match.
    """
    if IDENTIFIER_REGEX.fullmatch(name) is None:
        raise InvalidIdentifierError(f"Invalid identifier: {name}")
    return name
109
+
110
+
111
def quote_identifier(name: str) -> str:
    """Wrap *name* in square brackets, doubling every ``]`` it contains."""
    escaped = name.replace("]", "]]")
    return "[" + escaped + "]"
113
+
114
+
115
def sanitise_dsn_value(value: str) -> str:
    """Escape ODBC connection-string metacharacters in a value.

    If *value* contains a semicolon, an equals sign or either brace, it is
    wrapped in ``{…}`` with every literal ``}`` doubled, so the driver reads
    the whole token as one value. Empty values and values without any of
    those characters are returned unchanged.
    """
    metacharacters = frozenset(";{}=")
    if value and not metacharacters.isdisjoint(value):
        return "{" + value.replace("}", "}}") + "}"
    return value
127
+
128
+
129
@dataclass
class SqlConfig:
    """Connection settings for a SQL database.

    ``server`` and ``database`` are required; every other field falls back to
    the matching module-level ``*_DEFAULT`` constant (or ``None`` for the
    credentials).
    """

    server: str
    database: str
    username: str | None = None
    # Left out of the generated repr so that logging or printing a config
    # object never writes the credential out. Still part of equality.
    password: str | None = field(default=None, repr=False)
    driver: str = DRIVER_DEFAULT
    trusted_connection: bool = TRUSTED_CONNECTION_DEFAULT
    trust_cert: bool = TRUST_CERT_DEFAULT
    encrypt: bool = ENCRYPT_DEFAULT
    retries: int = RETRIES_DEFAULT
    packet_size: int = PACKET_SIZE_DEFAULT
141
+
142
+
143
# Running this module directly is a deliberate no-op; it only provides
# configuration constants and validators for import.
if __name__ == "__main__":
    pass
pybutt/core/logobs.py ADDED
@@ -0,0 +1,445 @@
1
+ """Centralised logging/observability helpers for PyButt.
2
+
3
+ All PyButt modules log through the ``pybutt`` logger (via :func:`get_logger`)
4
+ rather than the root logger. The CLI calls :func:`configure_logging` once at
5
+ startup; spawned export worker processes call it again through the pool
6
+ initialiser (see ``Exporter.perform_work``) so their output is formatted
7
+ identically on every platform (``spawn`` is the default on Windows/macOS and is
8
+ forced here on all OSes).
9
+
10
+ Library/API users who want PyButt's formatted output should call
11
+ :func:`configure_logging` themselves; otherwise standard ``logging`` rules apply.
12
+ """
13
+
14
+ import logging
15
+ import threading
16
+ import time as _time
17
+
18
+ import psutil
19
+
20
# Name of the package logger; get_logger() builds child names under it.
LOGGER_NAME = "pybutt"

# Timestamp + level + process/thread identity so concurrent workers' lines can be
# told apart and ordered. Identity matters because a single import run fans out
# across threads and an export run across (spawned) processes.
LOG_FORMAT = (
    "%(asctime)s %(levelname)s [%(processName)s/%(threadName)s] %(name)s: %(message)s"
)
# strftime pattern handed to logging.Formatter for the %(asctime)s field.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
29
+
30
+
31
def get_logger(name: str | None = None) -> logging.Logger:
    """Return the ``pybutt`` logger, or its child ``pybutt.<name>``.

    With no *name* (``None``) the package logger itself is returned.
    """
    qualified = LOGGER_NAME if name is None else f"{LOGGER_NAME}.{name}"
    return logging.getLogger(qualified)
36
+
37
+
38
def configure_logging(verbose: bool = False) -> logging.Logger:
    """Set up the ``pybutt`` logger and return it.

    The level becomes ``DEBUG`` when *verbose* is true, otherwise ``INFO``.
    A stream handler using :data:`LOG_FORMAT` / :data:`DATE_FORMAT` is
    attached only if no handler carrying the ``_pybutt_handler`` marker is
    present yet, so repeated calls never stack handlers. Propagation is
    switched off so records are not emitted a second time by the root logger.
    """
    logger = logging.getLogger(LOGGER_NAME)
    level = logging.DEBUG if verbose else logging.INFO
    logger.setLevel(level)

    already_installed = False
    for existing in logger.handlers:
        if getattr(existing, "_pybutt_handler", False):
            already_installed = True
            break

    if not already_installed:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter(LOG_FORMAT, DATE_FORMAT))
        # Marker attribute: lets later calls recognise our own handler.
        stream_handler._pybutt_handler = True
        logger.addHandler(stream_handler)

    logger.propagate = False
    return logger
56
+
57
+
58
def init_worker_logging(level: int) -> None:
    """Pool initialiser: set up logging inside a spawned worker process.

    Installs the handler through :func:`configure_logging`, then applies the
    exact numeric *level* to the ``pybutt`` logger, so levels other than
    ``DEBUG``/``INFO`` are preserved.
    """
    wants_debug = level <= logging.DEBUG
    configure_logging(verbose=wants_debug)
    package_logger = logging.getLogger(LOGGER_NAME)
    package_logger.setLevel(level)
62
+
63
+
64
def context(**fields: object) -> str:
    """Format keyword arguments as space-separated ``key=value`` pairs.

    Pairs whose value is ``None`` are left out; other falsy values such as
    ``0`` or ``""`` are kept. Order follows the keyword order.

    Example: ``context(file="a.parquet", rg="3/40", batch=12)`` ->
    ``"file=a.parquet rg=3/40 batch=12"``.
    """
    rendered = []
    for key, value in fields.items():
        if value is None:
            continue
        rendered.append(f"{key}={value}")
    return " ".join(rendered)
73
+
74
+
75
# --- memory observability --------------------------------------------------
#
# psutil gives a uniform *current* RSS on Windows/Linux/BSD/macOS (stdlib
# ``resource`` is Unix-only and its units differ by OS). There is no portable
# "peak RSS", so we track a running peak ourselves, per process. Export workers
# are separate processes, so each tracks (and reports) its own peak.

# Handle for the current process, created once at import time.
_process = psutil.Process()
# Highest RSS (bytes) seen so far by rss_bytes(); 0 until the first probe.
# NOTE(review): updated without a lock — presumably acceptable for a
# diagnostic value; confirm if callers rely on exactness across threads.
_peak_rss = 0
84
+
85
+
86
def _human_bytes(num: float) -> str:
    """Render a byte count compactly, e.g. ``1.8GB`` / ``512.0MB`` / ``900B``.

    Plain bytes are shown as a truncated integer; larger units use one
    decimal place. ``GB`` is the largest unit, so bigger values are shown as
    a large number of gigabytes.
    """
    value = float(num)
    if value < 1024:
        return f"{int(value)}B"
    for unit in ("KB", "MB"):
        value /= 1024
        if value < 1024:
            return f"{value:.1f}{unit}"
    return f"{value / 1024:.1f}GB"
94
+
95
+
96
def rss_bytes() -> int:
    """Return this process's current RSS in bytes and refresh the peak.

    Any failure of the memory probe yields ``0`` (and leaves the recorded
    peak alone), so a logging call can never fail because of it.
    """
    global _peak_rss
    try:
        current = _process.memory_info().rss
    except Exception:
        return 0
    _peak_rss = max(_peak_rss, current)
    return current
110
+
111
+
112
def peak_rss_bytes() -> int:
    """Return the highest RSS (bytes) observed in this process so far.

    The value is a running maximum over the probes made through
    :func:`rss_bytes`, not an OS-reported peak; a fresh probe is taken first
    so the current RSS is included.
    """
    # Called only for its side effect of updating _peak_rss.
    rss_bytes()
    return _peak_rss
116
+
117
+
118
def sys_mem_fields() -> dict[str, str]:
    """System-wide memory fields for :func:`context`.

    Returns ``{"sys_pct": "78%", "sys_avail": "4.2GB"}`` so log lines show how
    close the *machine* is to running out of memory, not just this process's
    own RSS. Returns an empty dict if the probe or the formatting fails.
    """
    try:
        snapshot = psutil.virtual_memory()
        percent_used = f"{snapshot.percent:.0f}%"
        available = _human_bytes(snapshot.available)
    except Exception:
        return {}
    return {"sys_pct": percent_used, "sys_avail": available}
133
+
134
+
135
def mem_fields() -> dict[str, str]:
    """Process RSS, running peak and system-wide memory fields.

    Intended to be splatted into :func:`context` at boundary log points, so
    the last line before an OOM-kill shows the memory trend, e.g.::

        context(file=fn, rows=n, **mem_fields())
    """
    current = rss_bytes()
    fields = {
        "rss": _human_bytes(current),
        "peak": _human_bytes(_peak_rss),
    }
    fields.update(sys_mem_fields())
    return fields
149
+
150
+
151
def log_memory_budget(
    *,
    operation: str,
    workers: int,
    total_rows: int | None = None,
    threshold_pct: float = 0,
) -> None:
    """Log one INFO line describing system memory before work starts.

    The line always carries the operation, worker count, total and available
    system memory and the percentage in use. ``total_rows`` is added when
    given. When ``threshold_pct`` is positive, the threshold and the headroom
    (bytes still usable before the machine reaches that percentage, floored
    at zero) are added too.

    The figures are a rough guide, not a precise estimate. Nothing is logged
    if the system memory probe fails.
    """
    log = get_logger("budget")
    try:
        vm = psutil.virtual_memory()
    except Exception:
        return

    sys_total = vm.total
    sys_avail = vm.available

    fields = [
        f"operation={operation}",
        f"workers={workers}",
        f"sys_total={_human_bytes(sys_total)}",
        f"sys_avail={_human_bytes(sys_avail)}",
        f"sys_pct={vm.percent:.0f}%",
    ]
    if total_rows is not None:
        fields.append(f"total_rows={total_rows}")
    if threshold_pct > 0:
        in_use = sys_total - sys_avail
        # Bytes that may be in use before the threshold percentage is hit.
        ceiling = int(sys_total * (1 - threshold_pct / 100))
        headroom = max(ceiling - in_use, 0)
        fields.append(f"threshold={threshold_pct:.0f}%")
        fields.append(f"headroom={_human_bytes(headroom)}")

    log.info("Memory budget " + " ".join(fields))
190
+
191
+
192
def log_failure_summary(
    *,
    operation: str,
    workers: int,
    completed: list[str] | None = None,
    failed_error: str = "",
) -> None:
    """Log an ERROR-level post-mortem after a pool/executor failure.

    The summary line reports the operation, the worker count, how many units
    completed, the system memory state (when the probe succeeds) and the
    error text (when given). If any units completed, a second ERROR line
    lists them, so the operator can see how much progress was lost.
    """
    log = get_logger("postmortem")
    try:
        vm = psutil.virtual_memory()
        sys_info = f"sys_pct={vm.percent:.0f}% sys_avail={_human_bytes(vm.available)}"
    except Exception:
        sys_info = ""

    finished = completed or []
    summary = [
        f"operation={operation}",
        f"workers={workers}",
        f"completed={len(finished)}/{workers}",
    ]
    # Optional trailing fields; empty ones are dropped.
    optional = [sys_info, f"error={failed_error}" if failed_error else ""]
    summary.extend(part for part in optional if part)

    log.error("FAILURE SUMMARY " + " ".join(summary))
    if finished:
        log.error(f" Completed units: {', '.join(finished)}")
225
+
226
+
227
+ class MemoryHeartbeat:
228
+ """Periodically log process RSS while a long operation runs.
229
+
230
+ Use as a context manager. A no-op when ``interval <= 0`` so callers can pass
231
+ a user-configured value unconditionally. The thread is a daemon and is
232
+ stopped/joined on exit. Runs in whichever process enters it, so for export
233
+ it must be entered inside the worker (where the memory actually lives).
234
+ """
235
+
236
+ def __init__(
237
+ self,
238
+ interval: float,
239
+ unit: str | None = None,
240
+ progress: dict[str, object] | None = None,
241
+ ):
242
+ self.interval = interval or 0
243
+ self.progress = progress
244
+ self.unit = unit
245
+ self._stop = threading.Event()
246
+ self._thread: threading.Thread | None = None
247
+
248
+ def __enter__(self) -> "MemoryHeartbeat":
249
+ if self.interval > 0:
250
+ self._thread = threading.Thread(
251
+ target=self._run, name="mem-heartbeat", daemon=True
252
+ )
253
+ self._thread.start()
254
+ return self
255
+
256
+ def __exit__(self, *exc: object) -> bool:
257
+ self._stop.set()
258
+ if self._thread is not None:
259
+ self._thread.join(timeout=self.interval + 1)
260
+ return False
261
+
262
+ def _run(self) -> None:
263
+ log = get_logger("mem")
264
+ while not self._stop.wait(self.interval):
265
+ extra = dict(self.progress) if self.progress else {}
266
+ log.info(
267
+ "Memory heartbeat " + context(unit=self.unit, **extra, **mem_fields())
268
+ )
269
+
270
+
271
class WorkerMonitor:
    """Monitor child worker processes from the parent and log their RSS.

    Runs a daemon thread that polls each worker PID via ``psutil``. When a
    worker disappears (e.g. OOM-killed by SIGKILL), the monitor logs the last
    known RSS and system memory state so the operator has a breadcrumb trail
    even though the child had no chance to log anything itself.

    Use as a context manager. A no-op when ``interval <= 0`` or when no PIDs
    are given.
    """

    def __init__(self, pids: list[int], interval: float):
        # A falsy interval (None, 0) is normalised to 0, i.e. disabled.
        self.interval = interval or 0
        # Copy so later changes to the caller's list do not affect polling.
        self._pids = list(pids)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # Most recent RSS (bytes) successfully read for each PID.
        self._last_rss: dict[int, int] = {}

    def __enter__(self) -> "WorkerMonitor":
        if self.interval > 0 and self._pids:
            self._thread = threading.Thread(
                target=self._run, name="worker-monitor", daemon=True
            )
            self._thread.start()
        return self

    def __exit__(self, *exc: object) -> bool:
        self._stop.set()
        if self._thread is not None:
            # Bounded wait so a stuck poll cannot block shutdown.
            self._thread.join(timeout=self.interval + 1)
        # Never suppress an exception raised inside the with-block.
        return False

    def _report_gone(
        self, log: logging.Logger, pid: int, sys_fields: dict[str, str]
    ) -> None:
        """Warn that *pid* has disappeared, with its last known RSS."""
        log.warning(
            "Worker vanished "
            + context(
                pid=pid,
                last_rss=_human_bytes(self._last_rss.get(pid, 0)),
                status="GONE",
                **sys_fields,
            )
            + " — likely OOM-killed"
        )

    def _run(self) -> None:
        """Poll every worker each ``interval`` seconds until stopped."""
        log = get_logger("monitor")
        procs: dict[int, psutil.Process] = {}
        # PIDs that are no longer polled (already reported or unreadable).
        finished: set[int] = set()

        for pid in self._pids:
            try:
                procs[pid] = psutil.Process(pid)
            except psutil.NoSuchProcess:
                # Already gone: reported as vanished on the first poll.
                pass
            except psutil.AccessDenied:
                # The process exists but cannot be inspected. That is not a
                # disappearance, so do not report it as one.
                finished.add(pid)
                log.debug(
                    "Worker unmonitored "
                    + context(pid=pid, status="access-denied")
                )

        while not self._stop.wait(self.interval):
            sys_fields = sys_mem_fields()
            for pid in self._pids:
                if pid in finished:
                    continue
                proc = procs.get(pid)
                if proc is None:
                    finished.add(pid)
                    self._report_gone(log, pid, sys_fields)
                    continue
                try:
                    rss = proc.memory_info().rss
                except psutil.NoSuchProcess:
                    finished.add(pid)
                    self._report_gone(log, pid, sys_fields)
                    continue
                except Exception as exc:
                    # Best effort: one failed probe must not stop monitoring,
                    # but leave a trace instead of swallowing it silently.
                    log.debug(
                        "Worker probe failed " + context(pid=pid, error=exc)
                    )
                    continue
                self._last_rss[pid] = rss
                log.debug(
                    "Worker health "
                    + context(
                        pid=pid,
                        rss=_human_bytes(rss),
                        status="alive",
                        **sys_fields,
                    )
                )
359
+
360
+
361
class MemoryGate:
    """Cooperative throttle: sleep the caller when system memory is high.

    Call :meth:`check` at natural pause points in hot loops (before a
    ``fetchmany``, ``read_row_group``, etc.). When system memory exceeds
    *threshold_pct*, the caller sleeps in increments of *sleep_seconds*
    until memory drops or *max_wait* is exhausted.

    After a throttle event (or max_wait timeout), a cooldown of
    *cooldown_seconds* prevents the gate from re-triggering on every
    subsequent loop iteration — allowing workers to make real progress
    between checks.

    A no-op when ``threshold_pct <= 0``.
    """

    # Poll interval used when ``sleep_seconds`` is not positive. A zero step
    # would never advance the waited total, so the wait loop could not end
    # while memory stays high.
    _FALLBACK_SLEEP_SECONDS = 1.0

    def __init__(
        self,
        threshold_pct: float = 0.0,
        sleep_seconds: float = 5.0,
        max_wait: float = 300.0,
        cooldown_seconds: float = 30.0,
    ):
        self.threshold_pct = threshold_pct
        self.sleep_seconds = sleep_seconds
        self.max_wait = max_wait
        self.cooldown_seconds = cooldown_seconds
        self._log = get_logger("gate")
        self._enabled = threshold_pct > 0
        # time.monotonic() has an undefined reference point, so "never
        # released" must be -inf, not 0.0; otherwise the first checks could
        # be mistaken for falling inside a cooldown window.
        self._last_release: float = float("-inf")

    def check(self, context_msg: str = "") -> float:
        """Block while system memory exceeds the threshold.

        *context_msg* labels the warning line (``"gate"`` when empty).

        Returns the total seconds waited (0.0 if no throttling occurred).
        Skips the check entirely when the gate is disabled or still within
        the cooldown window from the last throttle event.
        """
        if not self._enabled:
            return 0.0

        if _time.monotonic() - self._last_release < self.cooldown_seconds:
            return 0.0

        vm = psutil.virtual_memory()
        if vm.percent <= self.threshold_pct:
            return 0.0

        self._log.warning(
            "Memory pressure — throttling "
            + context(
                reason=context_msg or "gate",
                threshold=f"{self.threshold_pct:.0f}%",
                **mem_fields(),
            )
        )

        step = (
            self.sleep_seconds
            if self.sleep_seconds > 0
            else self._FALLBACK_SLEEP_SECONDS
        )
        waited = 0.0
        while vm.percent > self.threshold_pct and waited < self.max_wait:
            _time.sleep(step)
            waited += step
            vm = psutil.virtual_memory()
            self._log.info(
                "Throttle wait "
                + context(
                    waited=f"{waited:.0f}s",
                    sys_pct=f"{vm.percent:.0f}%",
                    threshold=f"{self.threshold_pct:.0f}%",
                    sys_avail=_human_bytes(vm.available),
                )
            )

        if vm.percent > self.threshold_pct:
            # max_wait ran out with memory still high: say so rather than
            # claiming the pressure was released.
            self._log.warning(
                "Throttle timed out "
                + context(
                    total_waited=f"{waited:.0f}s",
                    sys_pct=f"{vm.percent:.0f}%",
                    threshold=f"{self.threshold_pct:.0f}%",
                    sys_avail=_human_bytes(vm.available),
                )
            )
        elif waited > 0:
            self._log.info(
                "Throttle released "
                + context(
                    total_waited=f"{waited:.0f}s",
                    sys_pct=f"{vm.percent:.0f}%",
                    sys_avail=_human_bytes(vm.available),
                )
            )

        # Start the cooldown whether the wait ended by release or by timeout.
        self._last_release = _time.monotonic()
        return waited
pybutt/exceptions.py ADDED
@@ -0,0 +1,82 @@
1
class PyButtError(Exception):
    """Base class for all PyButt-specific errors.

    Catching this type catches every exception defined in this module.
    """


class ConfigurationError(PyButtError, ValueError):
    """Raised for invalid application configuration.

    Also a ``ValueError``, so callers may catch either type.
    """


class EngineSelectionError(ConfigurationError):
    """Raised when an unsupported engine is selected."""


class InvalidIdentifierError(ConfigurationError):
    """Raised when a SQL identifier is invalid."""


class InvalidParameterError(ConfigurationError):
    """Raised when a TVF parameter string contains unsafe content."""


class ManifestError(PyButtError, ValueError):
    """Base class for manifest validation errors.

    Also a ``ValueError``, so callers may catch either type.
    """


class ManifestNotFoundError(FileNotFoundError, ManifestError):
    """Raised when a manifest file cannot be found.

    Also a ``FileNotFoundError``, so generic file-handling code catches it.
    """


class InvalidManifestError(ManifestError):
    """Raised when a manifest file contains invalid data."""


class InvalidManifestEntryError(InvalidManifestError):
    """Raised when a manifest entry is malformed."""


class DuplicateManifestEntryError(InvalidManifestError):
    """Raised when a manifest contains duplicate file entries."""


class UnsupportedManifestVersionError(InvalidManifestError):
    """Raised when a manifest has an unsupported version."""


class UnsupportedManifestTypeError(InvalidManifestError):
    """Raised when a manifest type is not supported."""


class MissingManifestEntryError(FileNotFoundError, InvalidManifestError):
    """Raised when a manifest references a missing Parquet file.

    Also a ``FileNotFoundError``, so generic file-handling code catches it.
    """


class PathTraversalError(InvalidManifestError):
    """Raised when a manifest entry resolves outside its base directory."""


class SchemaMismatchError(PyButtError, ValueError):
    """Raised when Parquet schema does not match the destination table schema.

    Also a ``ValueError``, so callers may catch either type.
    """


class DataExportError(PyButtError, RuntimeError):
    """Raised when exporting data fails.

    Also a ``RuntimeError``, so callers may catch either type.
    """


class DataImportError(PyButtError, RuntimeError):
    """Raised when importing data fails.

    Also a ``RuntimeError``, so callers may catch either type.
    """


class BatchImportError(DataImportError):
    """Raised when a batch import fails after retries."""


class RowGroupImportError(DataImportError):
    """Raised when a row group import fails after retries."""


class RetryExceededError(PyButtError, RuntimeError):
    """Raised when retry logic exhausts all attempts.

    Also a ``RuntimeError``, so callers may catch either type.
    """


class TableEmptyError(DataExportError):
    """Raised when the source table is empty or missing."""
@@ -0,0 +1,28 @@
1
+ from .combine import combine_parquet_files
2
+ from .inspect import inspect_manifest, inspect_parquet_file
3
+ from .manifest import (
4
+ MANIFEST_VERSION_1,
5
+ MANIFEST_VERSION_2,
6
+ SUPPORTED_MANIFEST_TYPES,
7
+ default_import_manifest_filename,
8
+ default_manifest_filename,
9
+ load_file_manifest,
10
+ load_manifest,
11
+ validate_manifest_entries,
12
+ write_manifest,
13
+ )
14
+
15
# Public API of this package: every name imported above is re-exported.
__all__ = [
    "MANIFEST_VERSION_1",
    "MANIFEST_VERSION_2",
    "SUPPORTED_MANIFEST_TYPES",
    "default_manifest_filename",
    "default_import_manifest_filename",
    "load_file_manifest",
    "load_manifest",
    "validate_manifest_entries",
    "write_manifest",
    "inspect_manifest",
    "inspect_parquet_file",
    "combine_parquet_files",
]