dehelpers 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dehelpers/__init__.py +30 -0
- dehelpers/_redact.py +122 -0
- dehelpers/api.py +380 -0
- dehelpers/db.py +243 -0
- dehelpers/exceptions.py +66 -0
- dehelpers/logger.py +217 -0
- dehelpers/py.typed +1 -0
- dehelpers-0.1.0.dist-info/METADATA +279 -0
- dehelpers-0.1.0.dist-info/RECORD +11 -0
- dehelpers-0.1.0.dist-info/WHEEL +4 -0
- dehelpers-0.1.0.dist-info/licenses/LICENSE +21 -0
dehelpers/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""dehelpers: Lightweight utilities for data engineering pipelines."""
|
|
2
|
+
|
|
3
|
+
from dehelpers.api import NextLinkPagination, ResilientClient, RetryPolicy
|
|
4
|
+
from dehelpers.db import DatabaseManager
|
|
5
|
+
from dehelpers.exceptions import (
|
|
6
|
+
DatabaseError,
|
|
7
|
+
DPHError,
|
|
8
|
+
PaginationError,
|
|
9
|
+
RetryError,
|
|
10
|
+
)
|
|
11
|
+
from dehelpers.logger import LogContext, get_logger
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
# API
|
|
15
|
+
"ResilientClient",
|
|
16
|
+
"RetryPolicy",
|
|
17
|
+
"NextLinkPagination",
|
|
18
|
+
# Database
|
|
19
|
+
"DatabaseManager",
|
|
20
|
+
# Logger
|
|
21
|
+
"get_logger",
|
|
22
|
+
"LogContext",
|
|
23
|
+
# Exceptions
|
|
24
|
+
"DPHError",
|
|
25
|
+
"RetryError",
|
|
26
|
+
"PaginationError",
|
|
27
|
+
"DatabaseError",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
__version__ = "0.1.0"
|
dehelpers/_redact.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Shared redaction utilities.
|
|
2
|
+
|
|
3
|
+
Private module — not part of the public API. Used by the logger,
|
|
4
|
+
API client, and database manager to strip sensitive values before
|
|
5
|
+
they reach log output or ``__repr__`` strings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import copy
|
|
11
|
+
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
|
|
12
|
+
|
|
13
|
+
DEFAULT_SENSITIVE_KEYS: frozenset[str] = frozenset(
|
|
14
|
+
{
|
|
15
|
+
"password",
|
|
16
|
+
"secret",
|
|
17
|
+
"token",
|
|
18
|
+
"api_key",
|
|
19
|
+
"authorization",
|
|
20
|
+
"dsn",
|
|
21
|
+
"connection_string",
|
|
22
|
+
"credential",
|
|
23
|
+
"passphrase",
|
|
24
|
+
"private_key",
|
|
25
|
+
"client_secret",
|
|
26
|
+
}
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
REDACTED = "***REDACTED***"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _is_sensitive(key: str, sensitive_keys: frozenset[str]) -> bool:
|
|
33
|
+
"""Return True if *key* contains any sensitive substring (case-insensitive)."""
|
|
34
|
+
key_lower = key.lower()
|
|
35
|
+
return any(s in key_lower for s in sensitive_keys)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def redact_dict(
    d: dict,
    sensitive_keys: frozenset[str] | None = None,
    extra_sensitive_keys: frozenset[str] | None = None,
) -> dict:
    """Return a redacted copy of *d*; the input itself is left untouched.

    A value is replaced when its key contains one of the sensitive
    substrings, compared case-insensitively — ``db_password`` is caught by
    ``password``, for example.

    Parameters
    ----------
    d:
        Dictionary to redact.
    sensitive_keys:
        Complete replacement for the sensitive-key set.  ``None`` selects
        :data:`DEFAULT_SENSITIVE_KEYS`.
    extra_sensitive_keys:
        Further keys to add on top of whichever base set is in effect.

    Returns
    -------
    dict
        A new dictionary whose sensitive values are ``'***REDACTED***'``.
    """
    if sensitive_keys is None:
        active = DEFAULT_SENSITIVE_KEYS
    else:
        active = sensitive_keys

    # Union only when extras were actually supplied (None / empty skip it).
    if extra_sensitive_keys:
        active = active | extra_sensitive_keys

    return _redact_recursive(d, active)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _redact_recursive(obj: object, sensitive_keys: frozenset[str]) -> object:
|
|
72
|
+
"""Recursively redact sensitive keys in nested structures."""
|
|
73
|
+
if isinstance(obj, dict):
|
|
74
|
+
result: dict = {}
|
|
75
|
+
for k, v in obj.items():
|
|
76
|
+
if isinstance(k, str) and _is_sensitive(k, sensitive_keys):
|
|
77
|
+
result[k] = REDACTED
|
|
78
|
+
else:
|
|
79
|
+
result[k] = _redact_recursive(v, sensitive_keys)
|
|
80
|
+
return result
|
|
81
|
+
if isinstance(obj, (list, tuple)):
|
|
82
|
+
redacted = [_redact_recursive(item, sensitive_keys) for item in obj]
|
|
83
|
+
return type(obj)(redacted)
|
|
84
|
+
return copy.deepcopy(obj) if isinstance(obj, (set, frozenset)) else obj
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def redact_url(
    url: str,
    sensitive_keys: frozenset[str] | None = None,
) -> str:
    """Return *url* with credentials and sensitive query values masked.

    Two parts of the URL are inspected:

    * the authority component — a password written as
      ``user:password@host`` (the usual form for database DSNs) is
      replaced;
    * the query string — the value of every parameter whose (decoded)
      name matches the sensitive set is replaced.

    Parameters that are not sensitive are emitted exactly as they appeared
    in the input (same order, same percent-encoding), so the redacted URL
    keeps the structure of the original.

    Parameters
    ----------
    url:
        The URL to redact.
    sensitive_keys:
        Override the full sensitive-key set.  When ``None``, uses
        :data:`DEFAULT_SENSITIVE_KEYS`.

    Returns
    -------
    str
        The URL with the password and matching query-parameter values
        replaced by ``'***REDACTED***'``.  A URL with neither a password
        nor a query string is returned unchanged.
    """
    # Local import: the module-level import list does not include it.
    from urllib.parse import unquote_plus

    keys = sensitive_keys if sensitive_keys is not None else DEFAULT_SENSITIVE_KEYS

    parts = urlsplit(url)

    # --- authority: user:password@host -----------------------------------
    netloc = parts.netloc
    # rpartition: a password may itself contain '@'; the host never does.
    userinfo, at_sign, host = netloc.rpartition("@")
    if at_sign:
        user, colon, password = userinfo.partition(":")
        if colon and password:
            netloc = f"{user}:{REDACTED}@{host}"

    if not parts.query and netloc == parts.netloc:
        return url

    # --- query string ----------------------------------------------------
    query = parts.query
    if query:
        pairs = []
        for pair in query.split("&"):
            name = pair.partition("=")[0]
            # Decode only for the sensitivity test; the emitted text keeps
            # the caller's original encoding.
            if _is_sensitive(unquote_plus(name), keys):
                pairs.append(f"{name}={REDACTED}")
            else:
                pairs.append(pair)
        query = "&".join(pairs)

    return urlunsplit(parts._replace(netloc=netloc, query=query))
|
dehelpers/api.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
"""Resilient HTTP client with bounded retries, exponential backoff,
|
|
2
|
+
jitter, total-timeout guard, and next-link pagination.
|
|
3
|
+
|
|
4
|
+
Usage::
|
|
5
|
+
|
|
6
|
+
from dehelpers import ResilientClient, RetryPolicy
|
|
7
|
+
|
|
8
|
+
client = ResilientClient()
|
|
9
|
+
resp = client.get("https://api.example.com/data")
|
|
10
|
+
|
|
11
|
+
# Paginate through all items
|
|
12
|
+
for item in client.paginate("https://api.example.com/items"):
|
|
13
|
+
process(item)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import random
|
|
20
|
+
import time
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import Any, Iterator
|
|
23
|
+
|
|
24
|
+
import requests
|
|
25
|
+
|
|
26
|
+
from dehelpers._redact import redact_url
|
|
27
|
+
from dehelpers.exceptions import PaginationError, RetryError
|
|
28
|
+
|
|
29
|
+
__all__ = ["RetryPolicy", "NextLinkPagination", "ResilientClient"]
|
|
30
|
+
|
|
31
|
+
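# Safe (read-only) HTTP methods that are always eligible for retry. Other methods — including PUT and DELETE, which are idempotent per RFC 9110 — are retried only when ``retry_non_idempotent`` is set.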
|
|
32
|
+
_IDEMPOTENT_METHODS = frozenset({"GET", "HEAD", "OPTIONS"})
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Retry Policy
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
@dataclass(frozen=True)
class RetryPolicy:
    """Configuration for retry behaviour.

    Attributes
    ----------
    max_retries:
        Maximum number of *retries* (not total attempts).  Total attempts
        = ``max_retries + 1``.  Must be >= 0.
    backoff_base:
        Base delay in seconds for exponential backoff.  Must be >= 0.
    backoff_max:
        Maximum delay cap in seconds.  Must be >= 0.
    jitter:
        If ``True``, adds random jitter to the delay to prevent
        thundering-herd effects.
    total_timeout:
        Wall-clock cap in seconds measured from the **start of the first
        attempt**.  Retries abort if this is exceeded.  Must be >= 0.
    retryable_statuses:
        HTTP status codes that trigger a retry.  Any iterable of ints is
        accepted and stored as a ``frozenset``.
    retry_non_idempotent:
        If ``True``, retries POST/PUT/DELETE.  Default is ``False``.
    connect_timeout:
        Per-request TCP connect timeout in seconds.
    read_timeout:
        Per-request read timeout in seconds.

    Raises
    ------
    ValueError
        If ``max_retries``, ``backoff_base``, ``backoff_max`` or
        ``total_timeout`` is negative.
    """

    max_retries: int = 3
    backoff_base: float = 1.0
    backoff_max: float = 30.0
    jitter: bool = True
    total_timeout: float = 120.0
    retryable_statuses: frozenset[int] = field(
        default_factory=lambda: frozenset({429, 500, 502, 503, 504})
    )
    retry_non_idempotent: bool = False
    connect_timeout: float = 5.0
    read_timeout: float = 30.0

    def __post_init__(self) -> None:
        """Reject negative settings and normalise ``retryable_statuses``."""
        # A negative max_retries would make ``range(max_retries + 1)`` empty,
        # i.e. the client would fail without ever sending a request.
        for name in ("max_retries", "backoff_base", "backoff_max", "total_timeout"):
            value = getattr(self, name)
            if value < 0:
                raise ValueError(f"{name} must be >= 0, got {value!r}")

        if not isinstance(self.retryable_statuses, frozenset):
            # The dataclass is frozen, so bypass __setattr__.  Storing a
            # frozenset keeps instances hashable when a set/list is passed.
            object.__setattr__(
                self, "retryable_statuses", frozenset(self.retryable_statuses)
            )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Pagination
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
@dataclass(eq=False)
class NextLinkPagination:
    """Follows a ``next`` URL key in the JSON response.

    ``eq=False`` keeps the identity-based equality and hashing of an
    ordinary class; the dataclass only supplies ``__init__`` (same
    parameters, order and defaults) and a readable ``__repr__``.

    Parameters
    ----------
    next_key:
        Key in the JSON response that contains the next page URL.
    results_key:
        Key in the JSON response that contains the list of items.
    max_pages:
        Safety limit on the number of pages to fetch.  Must be >= 1.

    Raises
    ------
    ValueError
        If ``max_pages`` is smaller than 1.
    """

    next_key: str = "next"
    results_key: str = "results"
    max_pages: int = 100

    def __post_init__(self) -> None:
        """Reject a page limit under which no page could ever be fetched."""
        if self.max_pages < 1:
            raise ValueError(f"max_pages must be >= 1, got {self.max_pages!r}")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
# Client
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
class ResilientClient:
    """HTTP client with automatic retries, backoff, and pagination.

    Wraps a single ``requests.Session``.  Use the client as a context
    manager, or call :meth:`close`, to release the session.

    Parameters
    ----------
    retry_policy:
        Retry configuration.  A default-constructed :class:`RetryPolicy`
        is used when omitted.
    logger:
        Logger instance.  Falls back to ``logging.getLogger(__name__)``
        when omitted.
    """

    def __init__(
        self,
        retry_policy: RetryPolicy | None = None,
        logger: logging.Logger | None = None,
    ) -> None:
        self._policy = retry_policy or RetryPolicy()
        self._log = logger or logging.getLogger(__name__)
        self._session = requests.Session()

    # -- Public helpers -----------------------------------------------------

    def get(self, url: str, **kwargs: Any) -> requests.Response:
        """Send a GET request with retry protection."""
        return self.request("GET", url, **kwargs)

    def post(self, url: str, **kwargs: Any) -> requests.Response:
        """Send a POST request (retried only if ``retry_non_idempotent``)."""
        return self.request("POST", url, **kwargs)

    def put(self, url: str, **kwargs: Any) -> requests.Response:
        """Send a PUT request (retried only if ``retry_non_idempotent``)."""
        return self.request("PUT", url, **kwargs)

    def delete(self, url: str, **kwargs: Any) -> requests.Response:
        """Send a DELETE request (retried only if ``retry_non_idempotent``)."""
        return self.request("DELETE", url, **kwargs)

    # -- Core request -------------------------------------------------------

    def request(self, method: str, url: str, **kwargs: Any) -> requests.Response:
        """Send an HTTP request with bounded retries and backoff.

        Parameters
        ----------
        method:
            HTTP method name (case-insensitive).
        url:
            Target URL.  Only a redacted form is logged or put into
            messages built by this method.
        **kwargs:
            Forwarded to ``requests.Session.request``.  A ``timeout`` entry
            overrides the policy's ``(connect_timeout, read_timeout)``.

        Returns
        -------
        requests.Response
            The first response whose status code is below 400.

        Raises
        ------
        RetryError
            When a retryable status or a request-level exception persists
            after the permitted attempts, or the total timeout is
            exceeded.
        requests.HTTPError
            On HTTP errors whose status is not in ``retryable_statuses``
            (e.g. 400, 401, 403, 404).
        """
        policy = self._policy
        method_upper = method.upper()
        can_retry = (
            method_upper in _IDEMPOTENT_METHODS or policy.retry_non_idempotent
        )

        timeout_tuple = kwargs.pop(
            "timeout", (policy.connect_timeout, policy.read_timeout)
        )

        safe_url = redact_url(url)
        start = time.monotonic()
        last_exception: Exception | None = None
        last_status: int | None = None

        for attempt in range(policy.max_retries + 1):
            # Total-timeout guard (never blocks the very first attempt).
            elapsed = time.monotonic() - start
            if attempt > 0 and elapsed >= policy.total_timeout:
                raise RetryError(
                    f"Total timeout ({policy.total_timeout}s) exceeded after "
                    f"{attempt} attempt(s) for {method_upper} {safe_url}",
                    last_status=last_status,
                    attempts=attempt,
                ) from last_exception

            try:
                resp = self._session.request(
                    method_upper, url, timeout=timeout_tuple, **kwargs
                )
                last_status = resp.status_code

                self._log.info(
                    "HTTP %s %s -> %d (attempt %d/%d)",
                    method_upper,
                    safe_url,
                    resp.status_code,
                    attempt + 1,
                    policy.max_retries + 1,
                )

                # Success — return immediately.
                if resp.status_code < 400:
                    return resp

                # Retryable status?
                if (
                    can_retry
                    and resp.status_code in policy.retryable_statuses
                    and attempt < policy.max_retries
                ):
                    # Clamp so a large Retry-After cannot make the call
                    # sleep past the documented total_timeout.
                    delay = self._clamp_to_budget(
                        self._compute_delay(attempt, resp), start
                    )
                    self._log.warning(
                        "Retryable %d for %s %s — sleeping %.2fs",
                        resp.status_code,
                        method_upper,
                        safe_url,
                        delay,
                    )
                    time.sleep(delay)
                    continue

                # Not retrying — surface the HTTP error.
                resp.raise_for_status()

            except requests.exceptions.HTTPError as exc:
                # A retryable status that will not be retried any further
                # (retries used up, or retries disabled for this method)
                # is reported as RetryError; anything else (400, 401, 404,
                # ...) propagates unchanged.
                if last_status in policy.retryable_statuses:
                    raise RetryError(
                        # Report attempts actually made: a method that is
                        # not retried fails after a single attempt.
                        f"All {attempt + 1} attempts exhausted for "
                        f"{method_upper} {safe_url} (last status: {last_status})",
                        last_status=last_status,
                        attempts=attempt + 1,
                    ) from exc
                raise
            except requests.RequestException as exc:
                last_exception = exc
                if can_retry and attempt < policy.max_retries:
                    delay = self._clamp_to_budget(
                        self._compute_delay(attempt), start
                    )
                    self._log.warning(
                        "Connection error for %s %s (attempt %d/%d) — %s — "
                        "sleeping %.2fs",
                        method_upper,
                        safe_url,
                        attempt + 1,
                        policy.max_retries + 1,
                        type(exc).__name__,
                        delay,
                    )
                    time.sleep(delay)
                    continue
                raise RetryError(
                    f"Request failed after {attempt + 1} attempt(s) for "
                    f"{method_upper} {safe_url}: {exc}",
                    last_status=last_status,
                    attempts=attempt + 1,
                ) from exc

        # The loop ended without returning or raising.
        raise RetryError(
            f"All {policy.max_retries + 1} attempts exhausted for "
            f"{method_upper} {safe_url} (last status: {last_status})",
            last_status=last_status,
            attempts=policy.max_retries + 1,
        ) from last_exception

    # -- Pagination ---------------------------------------------------------

    def paginate(
        self,
        url: str,
        pagination: NextLinkPagination | None = None,
        **kwargs: Any,
    ) -> Iterator[dict]:
        """Yield individual items across paginated responses.

        Iteration stops when a page has no items, when the ``next`` value
        is ``None``/absent, or after ``max_pages`` pages (a warning is
        logged in that last case).

        Parameters
        ----------
        url:
            Initial page URL.
        pagination:
            Pagination strategy.  Defaults to :class:`NextLinkPagination`.
        **kwargs:
            Extra keyword arguments forwarded to each GET request.

        Yields
        ------
        dict
            Individual items from each page.

        Raises
        ------
        PaginationError
            When a page cannot be fetched or decoded, or its payload does
            not have the expected shape.
            ``PaginationError.collected_items`` contains the items fetched
            before the failure.
        """
        pag = pagination or NextLinkPagination()
        collected: list[dict] = []
        current_url: str | None = url

        for page_num in range(1, pag.max_pages + 1):
            if current_url is None:
                return

            try:
                resp = self.get(current_url, **kwargs)
                resp.raise_for_status()
                data = resp.json()
            except Exception as exc:
                raise PaginationError(
                    f"Failed on page {page_num}: {exc}",
                    collected_items=collected,
                    cause=exc,
                ) from exc

            # Valid JSON need not be an object; without this check the
            # ``.get`` below would raise AttributeError.
            if not isinstance(data, dict):
                raise PaginationError(
                    f"Expected a JSON object on page {page_num}, "
                    f"got {type(data).__name__}",
                    collected_items=collected,
                )

            items = data.get(pag.results_key, [])
            if not items:
                return
            if not isinstance(items, list):
                raise PaginationError(
                    f"Expected '{pag.results_key}' to be a list on page "
                    f"{page_num}, got {type(items).__name__}",
                    collected_items=collected,
                )

            collected.extend(items)
            yield from items

            # Validate the 'next' field.
            next_val = data.get(pag.next_key)
            if next_val is None:
                return
            if not isinstance(next_val, str):
                raise PaginationError(
                    f"Expected '{pag.next_key}' to be a string URL or None, "
                    f"got {type(next_val).__name__}: {next_val!r}",
                    collected_items=collected,
                )
            current_url = next_val

        self._log.warning("Reached max_pages limit (%d)", pag.max_pages)

    # -- Internal -----------------------------------------------------------

    def _compute_delay(
        self,
        attempt: int,
        response: requests.Response | None = None,
    ) -> float:
        """Calculate the backoff delay for the given zero-based attempt.

        A numeric ``Retry-After`` header on a 429 response takes
        precedence.  Otherwise the delay is
        ``min(backoff_base * 2**attempt, backoff_max)``, plus up to 25 %
        random jitter when the policy enables it.
        """
        policy = self._policy

        # Honour Retry-After header if present (429 responses).
        if response is not None and response.status_code == 429:
            retry_after = response.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    return max(0.0, float(retry_after))
                except (ValueError, TypeError):
                    pass  # Non-numeric value: fall through to backoff.

        delay = min(policy.backoff_base * (2**attempt), policy.backoff_max)
        if policy.jitter:
            delay += random.uniform(0, delay * 0.25)  # noqa: S311
        return delay

    def _clamp_to_budget(self, delay: float, start: float) -> float:
        """Limit *delay* to what is left of ``total_timeout`` since *start*."""
        remaining = self._policy.total_timeout - (time.monotonic() - start)
        return max(0.0, min(delay, remaining))

    # -- Cleanup ------------------------------------------------------------

    def close(self) -> None:
        """Close the underlying requests session."""
        self._session.close()

    def __enter__(self) -> ResilientClient:
        return self

    def __exit__(self, *_: object) -> None:
        self.close()
|
dehelpers/db.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""PostgreSQL-first database helper with safe connection pooling.
|
|
2
|
+
|
|
3
|
+
Built on SQLAlchemy 2.0 with context-managed sessions, pre-ping
|
|
4
|
+
health checks, connection recycling, and optional Pandas DataFrame
|
|
5
|
+
output.
|
|
6
|
+
|
|
7
|
+
Usage::
|
|
8
|
+
|
|
9
|
+
from dehelpers import DatabaseManager
|
|
10
|
+
|
|
11
|
+
with DatabaseManager() as db: # reads DATABASE_URL env var
|
|
12
|
+
rows = db.execute("SELECT * FROM users WHERE active = :active",
|
|
13
|
+
{"active": True})
|
|
14
|
+
df = db.to_dataframe("SELECT * FROM sales")
|
|
15
|
+
|
|
16
|
+
.. warning::
|
|
17
|
+
|
|
18
|
+
If using in forked environments (Airflow, multiprocessing), create
|
|
19
|
+
the ``DatabaseManager`` **inside each worker process** or call
|
|
20
|
+
:meth:`dispose` before forking.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
from sqlalchemy import Row, create_engine, text
|
|
29
|
+
from sqlalchemy.exc import SQLAlchemyError
|
|
30
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
31
|
+
|
|
32
|
+
from dehelpers._redact import redact_url
|
|
33
|
+
from dehelpers.exceptions import DatabaseError
|
|
34
|
+
|
|
35
|
+
__all__ = ["DatabaseManager"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DatabaseManager:
    """PostgreSQL connection manager with safe pooling defaults.

    Parameters
    ----------
    dsn:
        SQLAlchemy connection URL.  Falls back to the ``DATABASE_URL``
        environment variable when ``None`` or empty.
    pool_size:
        Number of persistent connections in the pool.
    max_overflow:
        Maximum additional connections beyond *pool_size*.
    pool_recycle:
        Seconds before a connection is recycled (replaced).
    pool_pre_ping:
        Forwarded to ``create_engine`` as ``pool_pre_ping``.
    pool_timeout:
        Seconds to wait for a connection from the pool.

    Raises
    ------
    DatabaseError
        If neither *dsn* nor ``DATABASE_URL`` provides a connection URL.
    """

    def __init__(
        self,
        dsn: str | None = None,
        *,
        pool_size: int = 5,
        max_overflow: int = 2,
        pool_recycle: int = 1800,
        pool_pre_ping: bool = True,
        pool_timeout: int = 30,
    ) -> None:
        resolved_dsn = dsn or os.environ.get("DATABASE_URL")
        if not resolved_dsn:
            raise DatabaseError(
                "No DSN provided and DATABASE_URL environment variable is not set."
            )
        self._dsn = resolved_dsn

        # SQLite uses SingletonThreadPool which doesn't support pool_size,
        # max_overflow, or pool_timeout.  Only pass those for real backends.
        engine_kwargs: dict[str, object] = {
            "pool_recycle": pool_recycle,
            "pool_pre_ping": pool_pre_ping,
        }
        if not resolved_dsn.startswith("sqlite"):
            engine_kwargs.update(
                pool_size=pool_size,
                max_overflow=max_overflow,
                pool_timeout=pool_timeout,
            )
        self._engine = create_engine(resolved_dsn, **engine_kwargs)
        self._session_factory = sessionmaker(bind=self._engine)

    # -- Context manager ----------------------------------------------------

    def __enter__(self) -> DatabaseManager:
        return self

    def __exit__(self, *_: object) -> None:
        self.dispose()

    # -- Session management -------------------------------------------------

    def session(self) -> _SessionContext:
        """Return a context manager that yields a SQLAlchemy ``Session``.

        Auto-commits on clean exit and auto-rolls-back on exception.

        Example::

            with db.session() as session:
                session.execute(text("INSERT INTO logs ..."))
        """
        return _SessionContext(self._session_factory)

    # -- Query shortcuts ----------------------------------------------------

    def execute(
        self,
        sql: str,
        params: dict[str, Any] | None = None,
    ) -> list[Row]:
        """Execute *sql*, commit, and return all rows as a ``list[Row]``.

        Statements that produce no result set (for example an ``INSERT``
        without ``RETURNING``) are executed and committed too; they return
        an empty list.

        Parameters
        ----------
        sql:
            SQL string (use ``:param`` style placeholders).
        params:
            Bind parameters.

        Returns
        -------
        list[Row]
            Rows from the query result, or ``[]`` when the statement
            returns no rows.

        Raises
        ------
        DatabaseError
            Wrapping any ``SQLAlchemyError`` raised while connecting,
            executing, fetching or committing.
        """
        try:
            with self._engine.connect() as conn:
                result = conn.execute(text(sql), params or {})
                # fetchall() raises on results without rows, which would
                # skip the commit below and lose the statement's effect.
                rows = list(result.fetchall()) if result.returns_rows else []
                conn.commit()
                return rows
        except SQLAlchemyError as exc:
            raise DatabaseError(f"Query execution failed: {exc}") from exc

    def fetch_one(
        self,
        sql: str,
        params: dict[str, Any] | None = None,
    ) -> Row | None:
        """Execute *sql*, commit, and return the first row, or ``None``.

        ``None`` is returned both when the result set is empty and when
        the statement does not return rows at all.

        Parameters
        ----------
        sql:
            SQL string.
        params:
            Bind parameters.

        Raises
        ------
        DatabaseError
            Wrapping any ``SQLAlchemyError`` raised along the way.
        """
        try:
            with self._engine.connect() as conn:
                result = conn.execute(text(sql), params or {})
                # Same guard as in execute(): only fetch when rows exist.
                row = result.fetchone() if result.returns_rows else None
                conn.commit()
                return row
        except SQLAlchemyError as exc:
            raise DatabaseError(f"Query execution failed: {exc}") from exc

    def to_dataframe(
        self,
        sql: str,
        params: dict[str, Any] | None = None,
    ) -> Any:  # -> pd.DataFrame (lazy import)
        """Execute *sql* and return the result as a Pandas DataFrame.

        Requires the ``[dataframe]`` extra::

            pip install dehelpers[dataframe]

        Raises
        ------
        ImportError
            If ``pandas`` is not installed.
        DatabaseError
            When a ``SQLAlchemyError`` is raised by the query.
        """
        # Imported lazily so pandas stays an optional dependency.
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for to_dataframe(). "
                "Install it with: pip install dehelpers[dataframe]"
            ) from None

        try:
            with self._engine.connect() as conn:
                return pd.read_sql(text(sql), conn, params=params or {})
        except SQLAlchemyError as exc:
            raise DatabaseError(
                f"DataFrame query failed: {exc}"
            ) from exc

    # -- Cleanup ------------------------------------------------------------

    def dispose(self) -> None:
        """Dispose the engine and close all pooled connections."""
        self._engine.dispose()

    # -- Repr (redacted) ----------------------------------------------------

    def __repr__(self) -> str:
        # A DSN carries its password in the ``user:password@host`` part,
        # so let SQLAlchemy mask that first; redact_url then handles any
        # sensitive query parameters.
        masked = self._engine.url.render_as_string(hide_password=True)
        safe = redact_url(masked)
        return f"DatabaseManager(dsn={safe!r})"
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
# Session context manager
|
|
219
|
+
# ---------------------------------------------------------------------------
|
|
220
|
+
class _SessionContext:
|
|
221
|
+
"""Internal context manager wrapping a SQLAlchemy Session.
|
|
222
|
+
|
|
223
|
+
Commits on clean exit, rolls back on exception.
|
|
224
|
+
"""
|
|
225
|
+
|
|
226
|
+
def __init__(self, factory: sessionmaker) -> None:
|
|
227
|
+
self._factory = factory
|
|
228
|
+
self._session: Session | None = None
|
|
229
|
+
|
|
230
|
+
def __enter__(self) -> Session:
|
|
231
|
+
self._session = self._factory()
|
|
232
|
+
return self._session
|
|
233
|
+
|
|
234
|
+
def __exit__(self, exc_type: type | None, *_: object) -> None:
|
|
235
|
+
if self._session is None:
|
|
236
|
+
return
|
|
237
|
+
try:
|
|
238
|
+
if exc_type is None:
|
|
239
|
+
self._session.commit()
|
|
240
|
+
else:
|
|
241
|
+
self._session.rollback()
|
|
242
|
+
finally:
|
|
243
|
+
self._session.close()
|
dehelpers/exceptions.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Custom exceptions for dehelpers.
|
|
2
|
+
|
|
3
|
+
Centralised exception hierarchy so every module raises from here
|
|
4
|
+
instead of scattering ad-hoc exceptions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__all__ = ["DPHError", "RetryError", "PaginationError", "DatabaseError"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DPHError(Exception):
    """Common base class for the exceptions defined by dehelpers."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RetryError(DPHError):
    """Raised when retry attempts are exhausted or the total timeout is hit.

    When the failure was triggered by another exception (connection
    error, timeout, HTTP error, ...), the raiser chains it with
    ``raise RetryError(...) from original`` so it is available as
    ``__cause__``.  ``__cause__`` is ``None`` if no such exception exists.

    Attributes:
        last_status: HTTP status code of the last attempt, or ``None``
            if no response was received.
        attempts: Total number of attempts made (including the first).
    """

    def __init__(
        self,
        message: str,
        *,
        # Quoted: this module does not enable postponed evaluation of
        # annotations, and ``int | None`` evaluated eagerly fails on
        # interpreters older than Python 3.10.
        last_status: "int | None" = None,
        attempts: int = 0,
    ) -> None:
        super().__init__(message)
        self.last_status = last_status
        self.attempts = attempts
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PaginationError(DPHError):
    """Raised on pagination failure.

    Carries the items collected before the failure so callers can decide
    whether to use partial results.

    Attributes:
        collected_items: Items successfully fetched before the error.
            This is the list object passed in (not a copy), or a new
            empty list when none was given.
    """

    def __init__(
        self,
        message: str,
        *,
        # Quoted: this module does not enable postponed evaluation of
        # annotations, and ``X | None`` evaluated eagerly fails on
        # interpreters older than Python 3.10.
        collected_items: "list[dict] | None" = None,
        cause: "Exception | None" = None,
    ) -> None:
        super().__init__(message)
        # ``is not None`` rather than ``or``: an empty list supplied by
        # the caller is kept instead of being swapped for a new one.
        self.collected_items = collected_items if collected_items is not None else []
        if cause is not None:
            # Lets callers attach the cause without ``raise ... from``.
            self.__cause__ = cause
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DatabaseError(DPHError):
    """Signals that a database operation failed.

    Used to wrap SQLAlchemy or driver-level exceptions; the wrapped
    exception stays reachable through ``__cause__``.
    """
|
dehelpers/logger.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Structured JSON logger with automatic secret redaction.
|
|
2
|
+
|
|
3
|
+
Built entirely on the stdlib :mod:`logging` module — no third-party
|
|
4
|
+
dependencies. Every log record is emitted as a single JSON line with
|
|
5
|
+
a consistent schema, making it ready for containerised environments
|
|
6
|
+
like Airflow, AWS ECS, or Google Cloud Run.
|
|
7
|
+
|
|
8
|
+
Usage::
|
|
9
|
+
|
|
10
|
+
from dehelpers import get_logger, LogContext
|
|
11
|
+
|
|
12
|
+
log = get_logger("my_etl", job_id="daily-sales")
|
|
13
|
+
log.info("Starting extraction", extra={"source": "api"})
|
|
14
|
+
|
|
15
|
+
with LogContext(request_id="abc-123"):
|
|
16
|
+
log.info("Fetched page", extra={"page": 1})
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import sys
|
|
24
|
+
import traceback
|
|
25
|
+
from contextvars import ContextVar
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
from dehelpers._redact import redact_dict
|
|
30
|
+
|
|
31
|
+
__all__ = ["get_logger", "LogContext"]
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Context variables for cross-cutting fields
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
_ctx_job_id: ContextVar[str | None] = ContextVar("job_id", default=None)
|
|
37
|
+
_ctx_request_id: ContextVar[str | None] = ContextVar("request_id", default=None)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LogContext:
    """Scope ``job_id`` and/or ``request_id`` onto log records for a block.

    Each identifier passed as a keyword is stored in its context variable
    on entry and reset to the previous value on exit. Identifiers left as
    ``None`` are not touched.

    Example::

        with LogContext(job_id="etl-run-42", request_id="req-abc"):
            logger.info("Processing")
    """

    def __init__(
        self,
        *,
        job_id: str | None = None,
        request_id: str | None = None,
    ) -> None:
        self._job_id = job_id
        self._request_id = request_id
        # Tokens returned by ContextVar.set(), kept in the order applied.
        self._tokens: list = []

    def __enter__(self) -> LogContext:
        job, request = self._job_id, self._request_id
        if job is not None:
            job_token = _ctx_job_id.set(job)
            self._tokens.append(job_token)
        if request is not None:
            request_token = _ctx_request_id.set(request)
            self._tokens.append(request_token)
        return self

    def __exit__(self, *_: object) -> None:
        # Undo in last-set-first order so each variable gets its prior value.
        for token in self._tokens[::-1]:
            token.var.reset(token)
        del self._tokens[:]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# JSON Formatter
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
class _JSONFormatter(logging.Formatter):
    """Format log records as single-line JSON, redacting extra fields.

    Schema (every record)::

        {
            "timestamp": "2026-07-02T11:43:50.123456+00:00",
            "level": "INFO",
            "message": "Fetched 200 rows",
            "module": "db",
            "function": "execute_query",
            "job_id": "etl-daily-sales",
            "request_id": null,
            "error": null
        }

    Fields passed via ``extra=`` are passed through ``redact_dict`` and
    merged into this payload with ``dict.update``.
    """

    # LogRecord attributes set by the logging machinery itself; anything
    # else found on the record is treated as a caller-supplied extra.
    _INTERNAL_KEYS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "relativeCreated",
            "exc_info",
            "exc_text",
            "stack_info",
            "lineno",
            "funcName",
            "pathname",
            "filename",
            "levelno",
            "levelname",
            "module",
            "msecs",
            "process",
            "processName",
            "thread",
            "threadName",
            "taskName",
            "message",
            # Cached on the record by stdlib formatters that render a time
            # (e.g. on another handler); logging rejects it as a user extra.
            "asctime",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """Return a single-line JSON string for *record*.

        If building the structured payload fails, a reduced JSON object
        flagged with ``"_formatter_error": true`` is returned instead of
        propagating the error.
        """
        try:
            return self._safe_format(record)
        except Exception:
            # Recursion / serialization guard: fall back to a minimal payload.
            return json.dumps(
                {
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "level": "ERROR",
                    "message": (
                        f"[FORMATTER ERROR] {self._message_or_raw(record)}"
                    ),
                    "module": getattr(record, "module", "unknown"),
                    "function": getattr(record, "funcName", "unknown"),
                    "_formatter_error": True,
                }
            )

    @staticmethod
    def _message_or_raw(record: logging.LogRecord) -> str:
        """Return the rendered message, or the raw template if rendering fails.

        ``record.getMessage()`` raises when the %-args do not match the
        template. That may be the very failure that triggered the fallback,
        so it must not be allowed to raise a second time.
        """
        try:
            return record.getMessage()
        except Exception:
            pass
        try:
            return str(record.msg)
        except Exception:
            return "<unprintable log message>"

    def _safe_format(self, record: logging.LogRecord) -> str:
        """Build the full JSON payload for *record*; may raise."""
        payload: dict[str, Any] = {
            "timestamp": datetime.fromtimestamp(
                record.created, tz=timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "job_id": _ctx_job_id.get(),
            "request_id": _ctx_request_id.get(),
            "error": None,
        }

        # Merge user-supplied extra fields (redacted).
        extras: dict[str, Any] = {
            k: v
            for k, v in record.__dict__.items()
            if k not in self._INTERNAL_KEYS
        }
        if extras:
            payload.update(redact_dict(extras))

        # Serialise exception info if present.
        if record.exc_info and record.exc_info[0] is not None:
            exc_type, exc_value, exc_tb = record.exc_info
            payload["error"] = {
                "type": exc_type.__name__ if exc_type else "Unknown",
                "message": str(exc_value),
                # Keep only the last 3 entries of the formatted traceback
                # (the tail, ending with the exception line) for brevity.
                "traceback": traceback.format_exception(
                    exc_type, exc_value, exc_tb
                )[-3:],
            }

        # default=str stringifies any value json cannot encode natively.
        return json.dumps(payload, default=str)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Public factory
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
def get_logger(
    name: str,
    *,
    job_id: str | None = None,
    level: int = logging.INFO,
) -> logging.Logger:
    """Return a stdlib :class:`~logging.Logger` with JSON formatting.

    Parameters
    ----------
    name:
        Logger name (typically the module or pipeline name).
    job_id:
        Optional default job identifier injected into every record.
        It is stored in the module's ``job_id`` context variable rather
        than on the returned logger, so it applies to every logger that
        uses this formatter in the current context. Can also be
        set/overridden at runtime via :class:`LogContext`.
    level:
        Logging level.  Defaults to ``INFO``.

    Returns
    -------
    logging.Logger
        A configured logger that writes JSON to *stderr*.
    """
    if job_id is not None:
        _ctx_job_id.set(job_id)

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Avoid duplicate handlers on repeated calls. Look for our own JSON
    # formatter rather than any StreamHandler: FileHandler subclasses
    # StreamHandler, so a logger that already had a file handler would
    # otherwise never receive the JSON stderr handler.
    has_json_handler = any(
        isinstance(h.formatter, _JSONFormatter) for h in logger.handlers
    )
    if not has_json_handler:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(_JSONFormatter())
        logger.addHandler(handler)

    # Don't propagate to root logger to avoid double output.
    logger.propagate = False

    return logger
|
dehelpers/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dehelpers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight utilities for data engineering pipelines: resilient HTTP, PostgreSQL helpers, and structured logging.
|
|
5
|
+
Project-URL: Homepage, https://github.com/shard-c6/dehelpers
|
|
6
|
+
Project-URL: Repository, https://github.com/shard-c6/dehelpers
|
|
7
|
+
Author: Shardul Chogale
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: psycopg[binary]>=3.0
|
|
22
|
+
Requires-Dist: requests>=2.28
|
|
23
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
24
|
+
Provides-Extra: dataframe
|
|
25
|
+
Requires-Dist: pandas>=2.0; extra == 'dataframe'
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-postgresql; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: responses>=0.23; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# dehelpers
|
|
34
|
+
|
|
35
|
+
Lightweight, production-hardened Python utilities for data engineering pipelines.
|
|
36
|
+
|
|
37
|
+
**Resilient HTTP** · **PostgreSQL helpers** · **Structured JSON logging** — with automatic secret redaction, bounded retries, and safe connection pooling.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Architecture & Flow
|
|
42
|
+
|
|
43
|
+
```mermaid
|
|
44
|
+
graph TD
|
|
45
|
+
subgraph External [External APIs & Services]
|
|
46
|
+
REST_API[REST API Source]
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
subgraph DPH [dehelpers Package]
|
|
50
|
+
direction TB
|
|
51
|
+
subgraph Client [Resilient Client]
|
|
52
|
+
RC[ResilientClient] --> |Configured by| RP[RetryPolicy]
|
|
53
|
+
RC --> |Iterates with| NLP[NextLinkPagination]
|
|
54
|
+
RC --> |Sanitizes query| RU[redact_url]
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
subgraph Logger [Structured Logger]
|
|
58
|
+
GL[get_logger] --> |Formats record| JF[JSONFormatter]
|
|
59
|
+
LogCtx[LogContext] --> |Context injection| CV[job_id / request_id]
|
|
60
|
+
JF --> |Deep-redacts secrets| RD[redact_dict]
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
subgraph Database [Database Manager]
|
|
64
|
+
DBM[DatabaseManager] --> |Yields sessions| SC[_SessionContext]
|
|
65
|
+
DBM --> |Manages pool| SQLA[SQLAlchemy Engine]
|
|
66
|
+
DBM --> |Lazy Load| DF[Pandas DataFrame]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
subgraph Target [Storage / Logs]
|
|
71
|
+
PG[(PostgreSQL DB)]
|
|
72
|
+
Stderr[Stderr / Cloud Logs]
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
REST_API ==> |Inbound Data| RC
|
|
76
|
+
RC --> |Yields items / logs events| GL
|
|
77
|
+
GL ==> |JSON Output| Stderr
|
|
78
|
+
RC --> |Normalized data| DBM
|
|
79
|
+
DBM ==> |Pool connections| PG
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Boundaries & Capabilities
|
|
85
|
+
|
|
86
|
+
Here is exactly what this package **is** and what it **is not**:
|
|
87
|
+
|
|
88
|
+
| Category / Layer | What this IS | What this IS NOT |
|
|
89
|
+
|:---|:---|:---|
|
|
90
|
+
| **API / HTTP** | A retry-protected wrapper around `requests.Session` with exponential backoff, jitter, and simple pagination. | An asynchronous network library (like `aiohttp` or `httpx`), fully-fledged HTTP client replacement, or GraphQL API wrapper. |
|
|
91
|
+
| **Database** | A thread-safe connection manager for PostgreSQL with pooling configuration, automated transaction commits/rollbacks, and lazy DataFrame output. | An Object-Relational Mapper (ORM) (like SQLModel/SQLAlchemy ORM), schema migration engine (like Alembic), or database administration tool. |
|
|
92
|
+
| **Logging** | A zero-dependency structured JSON formatter on top of standard `logging` with automatic deep secrets redaction. | A log routing system (like Fluentd/Logstash), file logger, metrics exporter, or complex log management server. |
|
|
93
|
+
| **Execution Context** | Designed for batch execution environments like Airflow tasks, ETL scripts, and containerized Docker runtimes. | Suitable for high-throughput, low-latency, real-time web servers or async microservices. |
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Comparison with Standard Setup
|
|
98
|
+
|
|
99
|
+
How this package compares to a standard DIY setup:
|
|
100
|
+
|
|
101
|
+
| Feature / Criteria | Standard Setup (`requests` + `logging` + `psycopg`) | `dehelpers` |
|
|
102
|
+
|:---|:---|:---|
|
|
103
|
+
| **Secret Leakage Protection** | Manual / None. Secrets easily print to stdout or appear in exception tracebacks. | **Automatic & Deep Recursive:** Redacts predefined secrets from nested metadata, logs, and query parameters. |
|
|
104
|
+
| **Retry & Jitter Strategy** | Manual loops or boilerplate `urllib3` retry configurations. | **Out-of-the-box resilience:** Exponential backoff with random jitter and clock-based `total_timeout` limit. |
|
|
105
|
+
| **Pagination Handling** | Custom pagination loop logic required for every API endpoint. | **Next-link strategy Protocol:** Yields individual items transparently and safely with validation. |
|
|
106
|
+
| **Connection Safety** | Connection leaks or transaction rollback failures if context managers are missed. | **Context-managed Session:** Engine-pooled with pre-ping checks, pool timeout, and auto-rollback. |
|
|
107
|
+
| **Dependency Footprint** | Heavy setup if installing frameworks like Loguru, Structlog, or heavy database utilities. | **Ultra-lightweight:** Base dependencies are minimal. Pandas is entirely optional and lazy-loaded. |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Roadmap & What's Next
|
|
112
|
+
|
|
113
|
+
| Phase | Feature / Expansion | Target Use Case | Status |
|
|
114
|
+
|:---|:---|:---|:---|
|
|
115
|
+
| **v1.0** | Core Resilient HTTP, Postgres Pool, Redacted Logger | Personal ETL scripts & Airflow workflows | **Released** |
|
|
116
|
+
| **v1.1** | Cursor-based Pagination (`CursorPagination`) | Handling APIs that paginate with cursors | *Planned* |
|
|
117
|
+
| **v1.2** | Async Client Support (`AsyncResilientClient`) | High-throughput concurrent API extraction pipelines | *Planned* |
|
|
118
|
+
| **v1.3** | Parquet / Arrow Ingestion Support | High-performance bulk column-based ingestion | *Planned* |
|
|
119
|
+
| **v2.0** | Schema Validation Layer (`pydantic` integration) | Ingestion payload sanitization and schema contracts | *Conceptual* |
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Install
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Core (HTTP + DB + logging)
|
|
127
|
+
pip install dehelpers
|
|
128
|
+
|
|
129
|
+
# With Pandas DataFrame support
|
|
130
|
+
pip install dehelpers[dataframe]
|
|
131
|
+
|
|
132
|
+
# Development (tests)
|
|
133
|
+
pip install dehelpers[dev,dataframe]
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Requires Python ≥ 3.10.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Quickstart
|
|
141
|
+
|
|
142
|
+
### Resilient HTTP Client
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from dehelpers import ResilientClient, RetryPolicy
|
|
146
|
+
|
|
147
|
+
# Custom policy: 5 retries, retry POST with opt-in
|
|
148
|
+
policy = RetryPolicy(max_retries=5, retry_non_idempotent=True)
|
|
149
|
+
client = ResilientClient(retry_policy=policy)
|
|
150
|
+
|
|
151
|
+
resp = client.get("https://api.example.com/data")
|
|
152
|
+
print(resp.json())
|
|
153
|
+
|
|
154
|
+
# Paginate through all items
|
|
155
|
+
for item in client.paginate("https://api.example.com/items"):
|
|
156
|
+
process(item)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### PostgreSQL Database Helper
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from dehelpers import DatabaseManager
|
|
163
|
+
|
|
164
|
+
# Reads DATABASE_URL from environment by default
|
|
165
|
+
with DatabaseManager() as db:
|
|
166
|
+
rows = db.execute(
|
|
167
|
+
"SELECT * FROM users WHERE active = :active",
|
|
168
|
+
{"active": True},
|
|
169
|
+
)
|
|
170
|
+
print(f"Found {len(rows)} active users")
|
|
171
|
+
|
|
172
|
+
# Optional: load into a Pandas DataFrame
|
|
173
|
+
df = db.to_dataframe("SELECT * FROM sales WHERE date > :d", {"d": "2026-01-01"})
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Structured JSON Logger
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from dehelpers import get_logger, LogContext
|
|
180
|
+
|
|
181
|
+
log = get_logger("my_etl", job_id="daily-sales")
|
|
182
|
+
|
|
183
|
+
with LogContext(request_id="req-abc"):
|
|
184
|
+
log.info("Fetched data", extra={"row_count": 500})
|
|
185
|
+
# Output: {"timestamp": "...", "level": "INFO", "message": "Fetched data",
|
|
186
|
+
# "module": "...", "function": "...", "job_id": "daily-sales", "request_id": "req-abc",
|
|
187
|
+
# "error": null, "row_count": 500}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Configuration
|
|
193
|
+
|
|
194
|
+
| Parameter | Default | Description |
|
|
195
|
+
|-----------|---------|-------------|
|
|
196
|
+
| `DATABASE_URL` (env var) | — | PostgreSQL connection string (fallback when `dsn` is not passed) |
|
|
197
|
+
| `pool_size` | 5 | Persistent connections in the pool |
|
|
198
|
+
| `max_overflow` | 2 | Extra connections beyond pool_size |
|
|
199
|
+
| `pool_recycle` | 1800 | Seconds before connection recycling |
|
|
200
|
+
| `pool_pre_ping` | True | Health-check connections before use |
|
|
201
|
+
| `pool_timeout` | 30 | Seconds to wait for a pool connection |
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Security
|
|
206
|
+
|
|
207
|
+
### Automatic Redaction
|
|
208
|
+
|
|
209
|
+
The logger and API client automatically redact values for these keys in log output:
|
|
210
|
+
|
|
211
|
+
`password`, `secret`, `token`, `api_key`, `authorization`, `dsn`, `connection_string`, `credential`, `passphrase`, `private_key`, `client_secret`
|
|
212
|
+
|
|
213
|
+
Matching is **case-insensitive substring** — e.g. `db_password` matches `password`.
|
|
214
|
+
|
|
215
|
+
You can extend the redaction list:
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from dehelpers._redact import redact_dict
|
|
219
|
+
|
|
220
|
+
result = redact_dict(
|
|
221
|
+
{"my_custom_secret": "value"},
|
|
222
|
+
extra_sensitive_keys=frozenset({"my_custom_secret"}),
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### ⚠️ Never Embed Secrets in URLs
|
|
227
|
+
|
|
228
|
+
URL query parameter values are redacted, but **path segments are not**. Never construct URLs like:
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
https://api.example.com/v1/token/abc123/data # BAD — token in path
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Instead, pass secrets via headers or request body.
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Fork Safety (Airflow / Multiprocessing)
|
|
239
|
+
|
|
240
|
+
If you use `DatabaseManager` in a forked environment (e.g. Airflow workers, `multiprocessing`), you **must** either:
|
|
241
|
+
|
|
242
|
+
1. Create the `DatabaseManager` **inside each worker process**, or
|
|
243
|
+
2. Call `db.dispose()` **before** forking.
|
|
244
|
+
|
|
245
|
+
SQLAlchemy connection pools are not safe to share across forked processes.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Testing
|
|
250
|
+
|
|
251
|
+
### Unit tests (no PostgreSQL required)
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
pip install -e ".[dev,dataframe]"
|
|
255
|
+
pytest -v --tb=short -m "not postgres"
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### PostgreSQL integration tests
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Start a local PostgreSQL
|
|
262
|
+
docker run -d --name pg-test -e POSTGRES_PASSWORD=test -p 5432:5432 postgres:16
|
|
263
|
+
|
|
264
|
+
# Run integration tests
|
|
265
|
+
DATABASE_URL="postgresql+psycopg://postgres:test@localhost:5432/postgres" \
|
|
266
|
+
pytest -m postgres -v
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### Coverage
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## License
|
|
278
|
+
|
|
279
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
dehelpers/__init__.py,sha256=KdUCfyOQhGUhavL8nwae2UpbJ-FoT0EY3SQ9whjIPlk,645
|
|
2
|
+
dehelpers/_redact.py,sha256=e5z1DcdpfRXZdXu-3jqPIjM1ZnKN2Y5e5hcnF1hZkc4,3672
|
|
3
|
+
dehelpers/api.py,sha256=1HROkWMGzepuvjii2tnE99ZTDWqW3YwzbtuTv-X_T8A,13079
|
|
4
|
+
dehelpers/db.py,sha256=uKY4C-X4jFc9N0osTViKNeZjXm7-Ib0vHmAdPFOLu_Q,7576
|
|
5
|
+
dehelpers/exceptions.py,sha256=0dfNcLYPAMcNZI0jldGb0bF_PPXygCmB_xuKLV9QObc,1818
|
|
6
|
+
dehelpers/logger.py,sha256=lfFNzAMGo7QsLS7tHvhYAC8FNkf3abxlJm7LRvQOnDU,6794
|
|
7
|
+
dehelpers/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
8
|
+
dehelpers-0.1.0.dist-info/METADATA,sha256=gmdyU0NbXeR2Z_uR0BJV0cFSWeuw1gBWw5IFN-_1LsU,9801
|
|
9
|
+
dehelpers-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
10
|
+
dehelpers-0.1.0.dist-info/licenses/LICENSE,sha256=eFS_S_z8RFffwvKMXclkzQlqbaMdyknmQaB549DZCBw,1072
|
|
11
|
+
dehelpers-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shardul Chogale
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|