pybutt 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- old_tests/app.py +713 -0
- pybutt/__init__.py +17 -0
- pybutt/cli/__init__.py +11 -0
- pybutt/cli/app.py +94 -0
- pybutt/cli/combine_command.py +236 -0
- pybutt/cli/export_command.py +317 -0
- pybutt/cli/import_command.py +286 -0
- pybutt/cli/inspect_command.py +30 -0
- pybutt/cli/purge_command.py +235 -0
- pybutt/core/__init__.py +30 -0
- pybutt/core/base.py +124 -0
- pybutt/core/config.py +144 -0
- pybutt/core/logobs.py +445 -0
- pybutt/exceptions.py +82 -0
- pybutt/files/__init__.py +28 -0
- pybutt/files/combine.py +93 -0
- pybutt/files/inspect.py +51 -0
- pybutt/files/manifest.py +160 -0
- pybutt/io/__init__.py +6 -0
- pybutt/io/combiner.py +119 -0
- pybutt/io/exporter.py +612 -0
- pybutt/io/importer.py +928 -0
- pybutt/io/purger.py +44 -0
- pybutt-2.0.0.dist-info/METADATA +756 -0
- pybutt-2.0.0.dist-info/RECORD +39 -0
- pybutt-2.0.0.dist-info/WHEEL +5 -0
- pybutt-2.0.0.dist-info/entry_points.txt +2 -0
- pybutt-2.0.0.dist-info/licenses/LICENSE +21 -0
- pybutt-2.0.0.dist-info/top_level.txt +3 -0
- tests/conftest.py +22 -0
- tests/test_cli.py +979 -0
- tests/test_cli_help.py +130 -0
- tests/test_combiner.py +259 -0
- tests/test_core.py +1009 -0
- tests/test_exporter.py +637 -0
- tests/test_files.py +178 -0
- tests/test_import_retry_logic.py +837 -0
- tests/test_logobs.py +491 -0
- tests/test_purge.py +219 -0
pybutt/core/config.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
|
|
5
|
+
from pybutt.exceptions import (
|
|
6
|
+
EngineSelectionError,
|
|
7
|
+
InvalidIdentifierError,
|
|
8
|
+
InvalidParameterError,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
ENGINE_CHOICES = frozenset({"duckdb", "pyodbc", "mssql-python"})
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TransactionMode(StrEnum):
    """Granularity at which an import groups rows into transactions.

    Members are strings, so ``TransactionMode("batch")`` looks a member up
    by value and members compare equal to their plain string value.
    """

    # One transaction per batch of ``batch_size`` rows.
    BATCH = "batch"
    # One transaction per row group of the parquet file.
    ROWGROUP = "rowgroup"
    # One transaction for the entire file.
    FILE = "file"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Global defaults
|
|
23
|
+
DRIVER_DEFAULT = "ODBC Driver 18 for SQL Server"
|
|
24
|
+
SCHEMA_DEFAULT = "dbo"
|
|
25
|
+
TRUSTED_CONNECTION_DEFAULT = False
|
|
26
|
+
TRUST_CERT_DEFAULT = False
|
|
27
|
+
ENCRYPT_DEFAULT = True
|
|
28
|
+
RETRIES_DEFAULT = 3
|
|
29
|
+
|
|
30
|
+
# Default memory heartbeat interval in seconds. Set to 30 so operators always
|
|
31
|
+
# have a recent RSS breadcrumb trail when a worker is OOM-killed.
|
|
32
|
+
MEM_HEARTBEAT_DEFAULT: float = 30.0
|
|
33
|
+
|
|
34
|
+
# Default memory-pressure throttle threshold (% system memory used). When system
|
|
35
|
+
# memory exceeds this %, workers sleep until pressure drops. Set to 85% so OOM
|
|
36
|
+
# kill is avoided without throttling during normal operation.
|
|
37
|
+
MEM_THRESHOLD_DEFAULT: float = 85.0
|
|
38
|
+
|
|
39
|
+
# Seconds to sleep per throttle cycle and max total wait before giving up.
|
|
40
|
+
MEM_SLEEP_DEFAULT: float = 5.0
|
|
41
|
+
MEM_MAX_WAIT_DEFAULT: float = 300.0
|
|
42
|
+
|
|
43
|
+
# Cooldown seconds after a throttle event before the gate re-checks. Prevents
|
|
44
|
+
# the gate from firing on every loop iteration and serialising workers.
|
|
45
|
+
MEM_COOLDOWN_DEFAULT: float = 30.0
|
|
46
|
+
|
|
47
|
+
# Default TDS packet size in bytes (4096, SQL Server's own default). 16383 is the maximum for encrypted
|
|
48
|
+
# connections (SQL Server caps encrypted packets at this size). Valid range
|
|
49
|
+
# for all drivers is 512–32767.
|
|
50
|
+
PACKET_SIZE_DEFAULT: int = 4_096
|
|
51
|
+
|
|
52
|
+
# Import specific defaults
|
|
53
|
+
IMPORT_ENGINE_DEFAULT = "mssql-python"
|
|
54
|
+
BATCH_SIZE_DEFAULT = 1_000
|
|
55
|
+
TRANSACTION_MODE_DEFAULT = TransactionMode.ROWGROUP
|
|
56
|
+
CCI_DEFAULT = True
|
|
57
|
+
|
|
58
|
+
# Export specific defaults
|
|
59
|
+
EXPORT_ENGINE_DEFAULT = "pyodbc"
|
|
60
|
+
FETCH_SIZE_DEFAULT = 1_000
|
|
61
|
+
ROWGROUP_SIZE_DEFAULT = 1_048_576
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_engine(engine: str, allowed: frozenset[str] | None = None) -> str:
    """Return *engine* unchanged when it is a permitted engine name.

    Args:
        engine: Engine name to check.
        allowed: Permitted names. When ``None`` the module-wide
            :data:`ENGINE_CHOICES` set is used; an empty set is honoured
            as "nothing is allowed".

    Raises:
        EngineSelectionError: if *engine* is not among the permitted names.
    """
    permitted = ENGINE_CHOICES if allowed is None else allowed
    if engine in permitted:
        return engine
    raise EngineSelectionError(f"engine must be one of {sorted(permitted)}")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def coerce_transaction_mode(mode: TransactionMode | str) -> TransactionMode:
    """Normalise *mode* to a :class:`TransactionMode` member.

    Strings (including existing members, which are strings too) are looked
    up by value; the enum lookup raises ``ValueError`` for an unknown value.
    Anything that is not a string is handed back untouched.
    """
    if not isinstance(mode, str):
        return mode
    return TransactionMode(mode)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Plain identifier: a letter or underscore followed by letters, digits or
# underscores. The end anchor is ``\Z`` rather than ``$`` because ``$`` also
# matches just before a trailing newline, which would let "name\n" through.
IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*\Z")
|
|
80
|
+
|
|
81
|
+
# TVF parameters must be a comma-separated list of literals:
# integers/decimals, single-quoted strings (no nested quotes), NULLs.
# re.ASCII keeps ``\d`` / ``\s`` to ASCII so that digits from other scripts
# are not accepted as numeric literals.
_PARAM_TOKEN_RE = re.compile(
    r"\s*(?:"
    r"NULL"
    r"|[+-]?\d+(?:\.\d+)?"
    r"|'[^']*'"
    r")\s*",
    re.IGNORECASE | re.ASCII,
)


def validate_parameters(params: str) -> str:
    """Reject parameter strings that could contain SQL injection payloads.

    Accepts only comma-separated SQL literals: numbers, single-quoted
    strings (no embedded quotes), and NULL. The string is scanned literal
    by literal instead of being split on ",", so a comma *inside* a quoted
    string (``'a,b'``) is part of that literal rather than a separator.

    Args:
        params: Raw parameter list, e.g. ``"1, 'abc', NULL"``.

    Returns:
        *params* unchanged when every literal is acceptable.

    Raises:
        InvalidParameterError: if the string is empty, has an empty slot
            (leading/trailing/double comma) or contains anything that is
            not one of the allowed literals.
    """
    pos = 0
    end = len(params)
    while True:
        match = _PARAM_TOKEN_RE.match(params, pos)
        stop = match.end() if match else pos
        # A literal must be followed directly by a separator or the end.
        if match is None or (stop < end and params[stop] != ","):
            cut = params.find(",", stop)
            offending = (params[pos:] if cut == -1 else params[pos:cut]).strip()
            raise InvalidParameterError(
                f"Unsafe TVF parameter token: {offending!r}. "
                "Only numeric literals, single-quoted strings, and NULL are allowed."
            )
        if stop == end:
            return params
        pos = stop + 1  # step over the separating comma
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def validate_identifier(name: str) -> str:
    """Return *name* unchanged if it is a plain SQL identifier.

    A plain identifier is whatever :data:`IDENTIFIER_REGEX` accepts.

    Raises:
        InvalidIdentifierError: if *name* does not match in its entirety.
    """
    # fullmatch requires the whole string to be consumed, so a trailing
    # newline (which a ``$``-anchored ``match`` tolerates) is rejected here
    # no matter how the pattern itself is anchored.
    if not IDENTIFIER_REGEX.fullmatch(name):
        raise InvalidIdentifierError(f"Invalid identifier: {name}")
    return name
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def quote_identifier(name: str) -> str:
    """Wrap *name* in square brackets, doubling every ``]`` it contains."""
    escaped = name.replace("]", "]]")
    return "[" + escaped + "]"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def sanitise_dsn_value(value: str) -> str:
    """Make *value* safe to embed in an ODBC connection string.

    A value containing ``;``, ``{``, ``}`` or ``=`` is wrapped in ``{…}``
    with every literal ``}`` doubled, so the driver reads the whole token
    as one value. Empty values and values without any of those characters
    are returned unchanged.
    """
    if value and not set(value).isdisjoint(";{}="):
        escaped = value.replace("}", "}}")
        return "".join(("{", escaped, "}"))
    return value
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class SqlConfig:
    """Settings describing one SQL Server connection.

    ``server`` and ``database`` are required; every other field falls back
    to the module-level ``*_DEFAULT`` constant of the same purpose.

    The ``repr`` masks a set ``password`` so that logging or printing a
    config object does not disclose the credential.
    """

    server: str
    database: str
    username: str | None = None
    password: str | None = None
    driver: str = DRIVER_DEFAULT
    trusted_connection: bool = TRUSTED_CONNECTION_DEFAULT
    trust_cert: bool = TRUST_CERT_DEFAULT
    encrypt: bool = ENCRYPT_DEFAULT
    retries: int = RETRIES_DEFAULT
    packet_size: int = PACKET_SIZE_DEFAULT

    def __repr__(self) -> str:
        """Render like the generated dataclass repr, with the password masked."""
        # @dataclass keeps a __repr__ defined in the class body instead of
        # generating its own, so this replaces the field-dumping default.
        shown = []
        for name in self.__dataclass_fields__:
            value = getattr(self, name)
            if name == "password" and value is not None:
                value = "***"
            shown.append(f"{name}={value!r}")
        return f"{type(self).__name__}({', '.join(shown)})"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
if __name__ == "__main__":
|
|
144
|
+
pass
|
pybutt/core/logobs.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
"""Centralised logging/observability helpers for PyButt.
|
|
2
|
+
|
|
3
|
+
All PyButt modules log through the ``pybutt`` logger (via :func:`get_logger`)
|
|
4
|
+
rather than the root logger. The CLI calls :func:`configure_logging` once at
|
|
5
|
+
startup; spawned export worker processes call it again through the pool
|
|
6
|
+
initialiser (see ``Exporter.perform_work``) so their output is formatted
|
|
7
|
+
identically on every platform (``spawn`` is the default on Windows/macOS and is
|
|
8
|
+
forced here on all OSes).
|
|
9
|
+
|
|
10
|
+
Library/API users who want PyButt's formatted output should call
|
|
11
|
+
:func:`configure_logging` themselves; otherwise standard ``logging`` rules apply.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import threading
|
|
16
|
+
import time as _time
|
|
17
|
+
|
|
18
|
+
import psutil
|
|
19
|
+
|
|
20
|
+
LOGGER_NAME = "pybutt"
|
|
21
|
+
|
|
22
|
+
# Timestamp + level + process/thread identity so concurrent workers' lines can be
|
|
23
|
+
# told apart and ordered. Identity matters because a single import run fans out
|
|
24
|
+
# across threads and an export run across (spawned) processes.
|
|
25
|
+
LOG_FORMAT = (
|
|
26
|
+
"%(asctime)s %(levelname)s [%(processName)s/%(threadName)s] %(name)s: %(message)s"
|
|
27
|
+
)
|
|
28
|
+
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_logger(name: str | None = None) -> logging.Logger:
    """Return the ``pybutt`` logger, or its child ``pybutt.<name>``.

    With ``name=None`` the package logger itself is returned; otherwise
    *name* is appended after a dot to form the child logger's name.
    """
    qualified = LOGGER_NAME if name is None else f"{LOGGER_NAME}.{name}"
    return logging.getLogger(qualified)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def configure_logging(verbose: bool = False) -> logging.Logger:
    """Set up the ``pybutt`` logger and return it; safe to call repeatedly.

    The level becomes ``DEBUG`` when *verbose* is true and ``INFO``
    otherwise, propagation to the root logger is switched off, and one
    ``StreamHandler`` formatted with :data:`LOG_FORMAT` / :data:`DATE_FORMAT`
    is attached. The handler carries a ``_pybutt_handler`` marker; when a
    marked handler is already attached no further handler is added.
    """
    logger = logging.getLogger(LOGGER_NAME)
    level = logging.DEBUG if verbose else logging.INFO
    logger.setLevel(level)
    logger.propagate = False

    already_installed = False
    for existing in logger.handlers:
        if getattr(existing, "_pybutt_handler", False):
            already_installed = True
            break

    if not already_installed:
        stream = logging.StreamHandler()
        stream.setFormatter(logging.Formatter(LOG_FORMAT, DATE_FORMAT))
        stream._pybutt_handler = True  # marker checked on later calls
        logger.addHandler(stream)

    return logger
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def init_worker_logging(level: int) -> None:
    """Pool initialiser: set up ``pybutt`` logging in a worker process.

    Runs :func:`configure_logging` (verbose when *level* is ``DEBUG`` or
    lower) and then applies *level* itself to the ``pybutt`` logger.
    """
    wants_debug = level <= logging.DEBUG
    configure_logging(verbose=wants_debug)
    # configure_logging only chooses between DEBUG and INFO; apply the
    # exact level that was requested.
    pybutt_logger = logging.getLogger(LOGGER_NAME)
    pybutt_logger.setLevel(level)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def context(**fields: object) -> str:
    """Format keyword arguments as space-separated ``key=value`` pairs.

    Pairs appear in call order; a pair whose value is ``None`` is left
    out (other falsy values such as ``0`` or ``""`` are kept).

    Example: ``context(file="a.parquet", rg="3/40", batch=12)`` ->
    ``"file=a.parquet rg=3/40 batch=12"``.
    """
    rendered = []
    for key, value in fields.items():
        if value is None:
            continue
        rendered.append(f"{key}={value}")
    return " ".join(rendered)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# --- memory observability --------------------------------------------------
|
|
76
|
+
#
|
|
77
|
+
# psutil gives a uniform *current* RSS on Windows/Linux/BSD/macOS (stdlib
|
|
78
|
+
# ``resource`` is Unix-only and its units differ by OS). There is no portable
|
|
79
|
+
# "peak RSS", so we track a running peak ourselves, per process. Export workers
|
|
80
|
+
# are separate processes, so each tracks (and reports) its own peak.
|
|
81
|
+
|
|
82
|
+
_process = psutil.Process()
|
|
83
|
+
_peak_rss = 0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _human_bytes(num: float) -> str:
    """Render a byte count compactly, e.g. ``1.8GB`` / ``512.0MB`` / ``900B``.

    Values below 1024 are shown as whole bytes; larger values are divided
    by 1024 per unit step and shown with one decimal. ``GB`` is the largest
    unit, so anything bigger is still expressed in GB.
    """
    value = float(num)
    if value < 1024:
        return f"{int(value)}B"
    for unit in ("KB", "MB"):
        value /= 1024
        if value < 1024:
            return f"{value:.1f}{unit}"
    # Largest unit: no further scaling beyond this step.
    return f"{value / 1024:.1f}GB"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def rss_bytes() -> int:
    """Sample this process's RSS in bytes and fold it into the running peak.

    Any failure of the probe yields 0 (and leaves the peak untouched), so
    a log call that includes memory figures cannot fail because of it.
    """
    global _peak_rss
    try:
        current = _process.memory_info().rss
    except Exception:
        return 0
    _peak_rss = max(_peak_rss, current)
    return current
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def peak_rss_bytes() -> int:
    """Return the largest RSS sampled so far in this process.

    A fresh sample is taken first so the current value is included.
    """
    rss_bytes()  # called for its side effect on the module-level peak
    return _peak_rss
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def sys_mem_fields() -> dict[str, str]:
    """Machine-wide memory figures, ready to splat into :func:`context`.

    Returns ``{"sys_pct": "78%", "sys_avail": "4.2GB"}``: the percentage of
    system memory in use and the amount still available, as reported by
    ``psutil.virtual_memory()``. Returns an empty dict if the probe or its
    formatting fails.
    """
    try:
        snapshot = psutil.virtual_memory()
        percent_used = f"{snapshot.percent:.0f}%"
        available = _human_bytes(snapshot.available)
    except Exception:
        return {}
    return {"sys_pct": percent_used, "sys_avail": available}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def mem_fields() -> dict[str, str]:
    """Process and system memory figures, ready to splat into :func:`context`.

    Contains ``rss`` (a fresh sample), ``peak`` (the running per-process
    peak after that sample) and whatever :func:`sys_mem_fields` returns.
    Typical use at a boundary log point::

        context(file=fn, rows=n, **mem_fields())
    """
    current = rss_bytes()
    fields = {
        "rss": _human_bytes(current),
        "peak": _human_bytes(_peak_rss),
    }
    fields.update(sys_mem_fields())
    return fields
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _headroom_bytes(total: int, avail: int, threshold_pct: float) -> int:
    """Bytes that may still be consumed before usage reaches *threshold_pct*.

    *threshold_pct* is a percentage of *total* that may be in use. The
    result is negative when usage is already above the threshold.
    """
    used = total - avail
    return int(total * threshold_pct / 100) - used


def log_memory_budget(
    *,
    operation: str,
    workers: int,
    total_rows: int | None = None,
    threshold_pct: float = 0,
) -> None:
    """Log a pre-flight memory budget so operators can gauge headroom.

    Emits one INFO line on the ``pybutt.budget`` logger with the operation
    name, worker count and the system's total / available / used-percent
    memory. The figures are deliberately rough: the line exists to surface
    an immediate "this probably won't fit" signal, not to be precise.

    Args:
        operation: Label of the operation about to run.
        workers: Number of workers that will be used.
        total_rows: Row count to include in the line, if known.
        threshold_pct: Used-memory percentage treated as the limit. When
            greater than 0 the line also reports the threshold and the
            headroom, i.e. how many more bytes can be used before the
            machine reaches it (clamped at 0).

    Nothing is logged if the system memory probe fails.
    """
    log = get_logger("budget")
    try:
        vm = psutil.virtual_memory()
    except Exception:
        return

    avail = vm.available
    total = vm.total
    pct = vm.percent

    parts = [
        f"operation={operation}",
        f"workers={workers}",
        f"sys_total={_human_bytes(total)}",
        f"sys_avail={_human_bytes(avail)}",
        f"sys_pct={pct:.0f}%",
    ]
    if total_rows is not None:
        parts.append(f"total_rows={total_rows}")
    if threshold_pct > 0:
        # Headroom is measured against the *allowed* share of memory
        # (threshold_pct of total), not against the reserved remainder.
        headroom_bytes = _headroom_bytes(total, avail, threshold_pct)
        parts.append(f"threshold={threshold_pct:.0f}%")
        parts.append(f"headroom={_human_bytes(max(headroom_bytes, 0))}")

    log.info("Memory budget " + " ".join(parts))
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def log_failure_summary(
    *,
    operation: str,
    workers: int,
    completed: list[str] | None = None,
    failed_error: str = "",
) -> None:
    """Log a structured post-mortem after a pool/executor failure.

    Emits an ERROR line on the ``pybutt.postmortem`` logger with the
    operation, the worker count, how many units completed, the system
    memory state (when it can be read) and the error text (when given).
    If any units completed, a second ERROR line lists them.

    Args:
        operation: Label of the operation that failed.
        workers: Number of workers that were used.
        completed: Names of the units that finished before the failure.
        failed_error: Description of the failure; omitted when empty.
    """
    log = get_logger("postmortem")

    try:
        snapshot = psutil.virtual_memory()
        memory_part = (
            f"sys_pct={snapshot.percent:.0f}% sys_avail={_human_bytes(snapshot.available)}"
        )
    except Exception:
        memory_part = ""

    finished = completed or []
    error_part = f"error={failed_error}" if failed_error else ""

    parts = [
        f"operation={operation}",
        f"workers={workers}",
        f"completed={len(finished)}/{workers}",
    ]
    parts.extend(part for part in (memory_part, error_part) if part)

    log.error("FAILURE SUMMARY " + " ".join(parts))
    if finished:
        log.error(f" Completed units: {', '.join(finished)}")
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
class MemoryHeartbeat:
    """Periodically log process RSS while a long operation runs.

    Use as a context manager. A no-op when ``interval <= 0`` (or ``None``)
    so callers can pass a user-configured value unconditionally. The thread
    is a daemon and is stopped/joined on exit. It runs in whichever process
    enters it, so for export it must be entered inside the worker (where
    the memory actually lives).

    Args:
        interval: Seconds between heartbeat lines.
        unit: Optional label included as ``unit=`` in every line.
        progress: Optional mapping whose current items are added to every
            line; it is read again on each beat.
    """

    def __init__(
        self,
        interval: float,
        unit: str | None = None,
        progress: dict[str, object] | None = None,
    ):
        self.interval = interval or 0
        self.progress = progress
        self.unit = unit
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None

    def __enter__(self) -> "MemoryHeartbeat":
        if self.interval > 0:
            self._thread = threading.Thread(
                target=self._run, name="mem-heartbeat", daemon=True
            )
            self._thread.start()
        return self

    def __exit__(self, *exc: object) -> bool:
        self._stop.set()
        if self._thread is not None:
            # wait() wakes as soon as the event is set, so the join normally
            # returns immediately; the timeout is only a safety bound.
            self._thread.join(timeout=self.interval + 1)
        return False  # never suppress the caller's exception

    def _run(self) -> None:
        """Thread body: log one heartbeat line per interval until stopped."""
        log = get_logger("mem")
        while not self._stop.wait(self.interval):
            # Merge into one dict before splatting: passing the pieces as
            # separate ** groups raises TypeError when a progress key
            # repeats "unit" or a memory field, which would end this thread.
            fields: dict[str, object] = {"unit": self.unit}
            if self.progress:
                fields.update(self.progress)
            fields.update(mem_fields())
            log.info("Memory heartbeat " + context(**fields))
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class WorkerMonitor:
    """Monitor child worker processes from the parent and log their RSS.

    Runs a daemon thread that polls each worker PID via ``psutil``. When a
    worker disappears (e.g. OOM-killed by SIGKILL), the monitor logs the
    last known RSS and system memory state so the operator has a breadcrumb
    trail even though the child had no chance to log anything itself.

    Use as a context manager. A no-op when ``interval <= 0`` or when no
    PIDs are given.
    """

    def __init__(self, pids: list[int], interval: float):
        self.interval = interval or 0
        self._pids = list(pids)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # Most recent RSS seen per PID, reported when the worker vanishes.
        self._last_rss: dict[int, int] = {}

    def __enter__(self) -> "WorkerMonitor":
        if self.interval > 0 and self._pids:
            self._thread = threading.Thread(
                target=self._run, name="worker-monitor", daemon=True
            )
            self._thread.start()
        return self

    def __exit__(self, *exc: object) -> bool:
        self._stop.set()
        if self._thread is not None:
            self._thread.join(timeout=self.interval + 1)
        return False  # never suppress the caller's exception

    def _log_vanished(
        self, log: logging.Logger, pid: int, sys_fields: dict[str, str]
    ) -> None:
        """Warn that *pid* is gone, with its last known RSS and system memory."""
        log.warning(
            "Worker vanished "
            + context(
                pid=pid,
                last_rss=_human_bytes(self._last_rss.get(pid, 0)),
                status="GONE",
                **sys_fields,
            )
            + " — likely OOM-killed"
        )

    def _run(self) -> None:
        """Thread body: poll every worker once per interval until stopped."""
        log = get_logger("monitor")
        procs: dict[int, psutil.Process] = {}
        for pid in self._pids:
            try:
                procs[pid] = psutil.Process(pid)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # No handle: the first poll reports this PID as vanished.
                continue

        gone: set[int] = set()  # PIDs already reported, so each is logged once

        while not self._stop.wait(self.interval):
            sys_fields = sys_mem_fields()
            for pid in self._pids:
                if pid in gone:
                    continue
                proc = procs.get(pid)
                if proc is None:
                    gone.add(pid)
                    self._log_vanished(log, pid, sys_fields)
                    continue
                try:
                    rss = proc.memory_info().rss
                except psutil.NoSuchProcess:
                    gone.add(pid)
                    self._log_vanished(log, pid, sys_fields)
                    continue
                except Exception as exc:
                    # Best effort: a failed probe (e.g. access denied) must
                    # not stop the monitor; try again on the next poll.
                    log.debug(
                        "Worker probe failed "
                        + context(pid=pid, error=type(exc).__name__)
                    )
                    continue
                self._last_rss[pid] = rss
                log.debug(
                    "Worker health "
                    + context(
                        pid=pid,
                        rss=_human_bytes(rss),
                        status="alive",
                        **sys_fields,
                    )
                )
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class MemoryGate:
    """Cooperative throttle: sleep the caller when system memory is high.

    Call :meth:`check` at natural pause points in hot loops (before a
    ``fetchmany``, ``read_row_group``, etc.). When system memory exceeds
    *threshold_pct*, the caller sleeps in increments of *sleep_seconds*
    until memory drops or *max_wait* is exhausted.

    After a throttle event (or max_wait timeout), a cooldown of
    *cooldown_seconds* prevents the gate from re-triggering on every
    subsequent loop iteration, allowing workers to make real progress
    between checks.

    A no-op when ``threshold_pct <= 0``.
    """

    def __init__(
        self,
        threshold_pct: float = 0.0,
        sleep_seconds: float = 5.0,
        max_wait: float = 300.0,
        cooldown_seconds: float = 30.0,
    ):
        self.threshold_pct = threshold_pct
        self.sleep_seconds = sleep_seconds
        self.max_wait = max_wait
        self.cooldown_seconds = cooldown_seconds
        self._log = get_logger("gate")
        self._enabled = threshold_pct > 0
        # "Never throttled yet". time.monotonic() has an undefined reference
        # point, so 0.0 could look like a recent release and wrongly put a
        # fresh gate into cooldown; -inf is always outside the window.
        self._last_release: float = float("-inf")

    def check(self, context_msg: str = "") -> float:
        """Block while system memory exceeds the threshold.

        Args:
            context_msg: Label logged as ``reason=`` when throttling starts
                ("gate" when empty).

        Returns:
            The total seconds waited, counted in whole ``sleep_seconds``
            steps (0.0 if no throttling occurred). The check is skipped
            entirely while still within the cooldown window that follows
            the last throttle event.
        """
        if not self._enabled:
            return 0.0

        now = _time.monotonic()
        if now - self._last_release < self.cooldown_seconds:
            return 0.0

        waited = 0.0
        vm = psutil.virtual_memory()
        if vm.percent <= self.threshold_pct:
            return 0.0

        self._log.warning(
            "Memory pressure — throttling "
            + context(
                reason=context_msg or "gate",
                threshold=f"{self.threshold_pct:.0f}%",
                **mem_fields(),
            )
        )

        while vm.percent > self.threshold_pct and waited < self.max_wait:
            _time.sleep(self.sleep_seconds)
            waited += self.sleep_seconds
            vm = psutil.virtual_memory()
            self._log.info(
                "Throttle wait "
                + context(
                    waited=f"{waited:.0f}s",
                    sys_pct=f"{vm.percent:.0f}%",
                    threshold=f"{self.threshold_pct:.0f}%",
                    sys_avail=_human_bytes(vm.available),
                )
            )

        if waited > 0:
            self._log.info(
                "Throttle released "
                + context(
                    total_waited=f"{waited:.0f}s",
                    sys_pct=f"{vm.percent:.0f}%",
                    sys_avail=_human_bytes(vm.available),
                )
            )

        # Start the cooldown whether memory dropped or max_wait ran out.
        self._last_release = _time.monotonic()
        return waited
|
pybutt/exceptions.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
class PyButtError(Exception):
    """Root of the PyButt exception hierarchy; catch this to catch them all."""


class ConfigurationError(PyButtError, ValueError):
    """Invalid application configuration.

    Also a ``ValueError``, so ``except ValueError`` handlers catch it.
    """


class EngineSelectionError(ConfigurationError):
    """An unsupported engine was selected."""


class InvalidIdentifierError(ConfigurationError):
    """A SQL identifier is not valid."""


class InvalidParameterError(ConfigurationError):
    """A TVF parameter string contains unsafe content."""


class ManifestError(PyButtError, ValueError):
    """Base class for manifest validation errors (also a ``ValueError``)."""


class ManifestNotFoundError(FileNotFoundError, ManifestError):
    """A manifest file cannot be found.

    Both a ``FileNotFoundError`` and a :class:`ManifestError`.
    """


class InvalidManifestError(ManifestError):
    """A manifest file contains invalid data."""


class InvalidManifestEntryError(InvalidManifestError):
    """A manifest entry is malformed."""


class DuplicateManifestEntryError(InvalidManifestError):
    """A manifest contains duplicate file entries."""


class UnsupportedManifestVersionError(InvalidManifestError):
    """A manifest has an unsupported version."""


class UnsupportedManifestTypeError(InvalidManifestError):
    """A manifest type is not supported."""


class MissingManifestEntryError(FileNotFoundError, InvalidManifestError):
    """A manifest references a Parquet file that is missing.

    Both a ``FileNotFoundError`` and an :class:`InvalidManifestError`.
    """


class PathTraversalError(InvalidManifestError):
    """A manifest entry resolves outside its base directory."""


class SchemaMismatchError(PyButtError, ValueError):
    """The Parquet schema does not match the destination table schema."""


class DataExportError(PyButtError, RuntimeError):
    """Exporting data failed (also a ``RuntimeError``)."""


class DataImportError(PyButtError, RuntimeError):
    """Importing data failed (also a ``RuntimeError``)."""


class BatchImportError(DataImportError):
    """A batch import failed after retries."""


class RowGroupImportError(DataImportError):
    """A row group import failed after retries."""


class RetryExceededError(PyButtError, RuntimeError):
    """Retry logic exhausted all of its attempts."""


class TableEmptyError(DataExportError):
    """The source table is empty or missing."""
|
pybutt/files/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from .combine import combine_parquet_files
|
|
2
|
+
from .inspect import inspect_manifest, inspect_parquet_file
|
|
3
|
+
from .manifest import (
|
|
4
|
+
MANIFEST_VERSION_1,
|
|
5
|
+
MANIFEST_VERSION_2,
|
|
6
|
+
SUPPORTED_MANIFEST_TYPES,
|
|
7
|
+
default_import_manifest_filename,
|
|
8
|
+
default_manifest_filename,
|
|
9
|
+
load_file_manifest,
|
|
10
|
+
load_manifest,
|
|
11
|
+
validate_manifest_entries,
|
|
12
|
+
write_manifest,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"MANIFEST_VERSION_1",
|
|
17
|
+
"MANIFEST_VERSION_2",
|
|
18
|
+
"SUPPORTED_MANIFEST_TYPES",
|
|
19
|
+
"default_manifest_filename",
|
|
20
|
+
"default_import_manifest_filename",
|
|
21
|
+
"load_file_manifest",
|
|
22
|
+
"load_manifest",
|
|
23
|
+
"validate_manifest_entries",
|
|
24
|
+
"write_manifest",
|
|
25
|
+
"inspect_manifest",
|
|
26
|
+
"inspect_parquet_file",
|
|
27
|
+
"combine_parquet_files",
|
|
28
|
+
]
|