dehelpers 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dehelpers-0.1.0/.github/workflows/ci.yml +34 -0
- dehelpers-0.1.0/.github/workflows/publish-pypi.yml +35 -0
- dehelpers-0.1.0/.gitignore +47 -0
- dehelpers-0.1.0/LICENSE +21 -0
- dehelpers-0.1.0/PKG-INFO +279 -0
- dehelpers-0.1.0/README.md +247 -0
- dehelpers-0.1.0/pyproject.toml +52 -0
- dehelpers-0.1.0/src/dehelpers/__init__.py +30 -0
- dehelpers-0.1.0/src/dehelpers/_redact.py +122 -0
- dehelpers-0.1.0/src/dehelpers/api.py +380 -0
- dehelpers-0.1.0/src/dehelpers/db.py +243 -0
- dehelpers-0.1.0/src/dehelpers/exceptions.py +66 -0
- dehelpers-0.1.0/src/dehelpers/logger.py +217 -0
- dehelpers-0.1.0/src/dehelpers/py.typed +1 -0
- dehelpers-0.1.0/tests/__init__.py +1 -0
- dehelpers-0.1.0/tests/conftest.py +26 -0
- dehelpers-0.1.0/tests/test_api.py +303 -0
- dehelpers-0.1.0/tests/test_db.py +236 -0
- dehelpers-0.1.0/tests/test_logger.py +159 -0
- dehelpers-0.1.0/tests/test_redact.py +105 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Python CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout code
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
cache: 'pip'
|
|
25
|
+
cache-dependency-path: pyproject.toml
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
pip install -e ".[dev,dataframe]"
|
|
31
|
+
|
|
32
|
+
- name: Run tests with coverage
|
|
33
|
+
run: |
|
|
34
|
+
pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write # Required for PyPI OIDC Trusted Publishing
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
pypi-publish:
|
|
12
|
+
name: Upload release to PyPI
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment:
|
|
15
|
+
name: pypi
|
|
16
|
+
url: https://pypi.org/p/dehelpers
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout code
|
|
19
|
+
uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: '3.11'
|
|
25
|
+
|
|
26
|
+
- name: Install build dependencies
|
|
27
|
+
run: |
|
|
28
|
+
python -m pip install --upgrade pip
|
|
29
|
+
pip install build
|
|
30
|
+
|
|
31
|
+
- name: Build package
|
|
32
|
+
run: python -m build
|
|
33
|
+
|
|
34
|
+
- name: Publish package distributions to PyPI
|
|
35
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# Pytest / Coverage
|
|
26
|
+
.pytest_cache/
|
|
27
|
+
.coverage
|
|
28
|
+
htmlcov/
|
|
29
|
+
.cov/
|
|
30
|
+
.nox/
|
|
31
|
+
.tox/
|
|
32
|
+
|
|
33
|
+
# Environments
|
|
34
|
+
.env
|
|
35
|
+
.venv
|
|
36
|
+
env/
|
|
37
|
+
venv/
|
|
38
|
+
ENV/
|
|
39
|
+
env.bak/
|
|
40
|
+
venv.bak/
|
|
41
|
+
|
|
42
|
+
# IDEs
|
|
43
|
+
.vscode/
|
|
44
|
+
.idea/
|
|
45
|
+
*.swp
|
|
46
|
+
*.swo
|
|
47
|
+
.DS_Store
|
dehelpers-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shardul Chogale
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dehelpers-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dehelpers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight utilities for data engineering pipelines: resilient HTTP, PostgreSQL helpers, and structured logging.
|
|
5
|
+
Project-URL: Homepage, https://github.com/shard-c6/dehelpers
|
|
6
|
+
Project-URL: Repository, https://github.com/shard-c6/dehelpers
|
|
7
|
+
Author: Shardul Chogale
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: psycopg[binary]>=3.0
|
|
22
|
+
Requires-Dist: requests>=2.28
|
|
23
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
24
|
+
Provides-Extra: dataframe
|
|
25
|
+
Requires-Dist: pandas>=2.0; extra == 'dataframe'
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-postgresql; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: responses>=0.23; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# dehelpers
|
|
34
|
+
|
|
35
|
+
Lightweight, production-hardened Python utilities for data engineering pipelines.
|
|
36
|
+
|
|
37
|
+
**Resilient HTTP** · **PostgreSQL helpers** · **Structured JSON logging** — with automatic secret redaction, bounded retries, and safe connection pooling.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Architecture & Flow
|
|
42
|
+
|
|
43
|
+
```mermaid
|
|
44
|
+
graph TD
|
|
45
|
+
subgraph External [External APIs & Services]
|
|
46
|
+
REST_API[REST API Source]
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
subgraph DPH [dehelpers Package]
|
|
50
|
+
direction TB
|
|
51
|
+
subgraph Client [Resilient Client]
|
|
52
|
+
RC[ResilientClient] --> |Configured by| RP[RetryPolicy]
|
|
53
|
+
RC --> |Iterates with| NLP[NextLinkPagination]
|
|
54
|
+
RC --> |Sanitizes query| RU[redact_url]
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
subgraph Logger [Structured Logger]
|
|
58
|
+
GL[get_logger] --> |Formats record| JF[JSONFormatter]
|
|
59
|
+
LogCtx[LogContext] --> |Context injection| CV[job_id / request_id]
|
|
60
|
+
JF --> |Deep-redacts secrets| RD[redact_dict]
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
subgraph Database [Database Manager]
|
|
64
|
+
DBM[DatabaseManager] --> |Yields sessions| SC[_SessionContext]
|
|
65
|
+
DBM --> |Manages pool| SQLA[SQLAlchemy Engine]
|
|
66
|
+
DBM --> |Lazy Load| DF[Pandas DataFrame]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
subgraph Target [Storage / Logs]
|
|
71
|
+
PG[(PostgreSQL DB)]
|
|
72
|
+
Stderr[Stderr / Cloud Logs]
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
REST_API ==> |Inbound Data| RC
|
|
76
|
+
RC --> |Yields items / logs events| GL
|
|
77
|
+
GL ==> |JSON Output| Stderr
|
|
78
|
+
RC --> |Normalized data| DBM
|
|
79
|
+
DBM ==> |Pool connections| PG
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Boundaries & Capabilities
|
|
85
|
+
|
|
86
|
+
Here is exactly what this package **is** and what it **is not**:
|
|
87
|
+
|
|
88
|
+
| Category / Layer | What this IS | What this IS NOT |
|
|
89
|
+
|:---|:---|:---|
|
|
90
|
+
| **API / HTTP** | A retry-protected wrapper around `requests.Session` with exponential backoff, jitter, and simple pagination. | An asynchronous network library (like `aiohttp` or `httpx`), a fully fledged HTTP client replacement, or a GraphQL API wrapper. |
|
|
91
|
+
| **Database** | A thread-safe connection manager for PostgreSQL with pooling configuration, automated transaction commits/rollbacks, and lazy DataFrame output. | An Object-Relational Mapper (ORM) (like SQLModel/SQLAlchemy ORM), schema migration engine (like Alembic), or database administration tool. |
|
|
92
|
+
| **Logging** | A structured JSON formatter built on the standard `logging` module, with no extra dependencies and automatic deep redaction of secrets. | A log routing system (like Fluentd/Logstash), file logger, metrics exporter, or complex log management server. |
|
|
93
|
+
| **Execution Context** | Designed for batch execution environments like Airflow tasks, ETL scripts, and containerized Docker runtimes. | Suitable for high-throughput, low-latency, real-time web servers or async microservices. |
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Comparison with Standard Setup
|
|
98
|
+
|
|
99
|
+
How this package compares to a standard DIY setup:
|
|
100
|
+
|
|
101
|
+
| Feature / Criteria | Standard Setup (`requests` + `logging` + `psycopg`) | `dehelpers` |
|
|
102
|
+
|:---|:---|:---|
|
|
103
|
+
| **Secret Leakage Protection** | Manual / None. Secrets easily print to stdout or appear in exception tracebacks. | **Automatic & Deep Recursive:** Redacts predefined secrets from nested metadata, logs, and query parameters. |
|
|
104
|
+
| **Retry & Jitter Strategy** | Manual loops or boilerplate `urllib3` retry configurations. | **Out-of-the-box resilience:** Exponential backoff with random jitter and clock-based `total_timeout` limit. |
|
|
105
|
+
| **Pagination Handling** | Custom pagination loop logic required for every API endpoint. | **Next-link strategy Protocol:** Yields individual items transparently and safely with validation. |
|
|
106
|
+
| **Connection Safety** | Connection leaks or failed transaction rollbacks when context managers (`with` blocks) are forgotten. | **Context-managed Session:** Engine-pooled with pre-ping checks, pool timeout, and auto-rollback. |
|
|
107
|
+
| **Dependency Footprint** | Heavy setup if installing frameworks like Loguru, Structlog, or heavy database utilities. | **Ultra-lightweight:** Base dependencies are minimal. Pandas is entirely optional and lazy-loaded. |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Roadmap & What's Next
|
|
112
|
+
|
|
113
|
+
| Phase | Feature / Expansion | Target Use Case | Status |
|
|
114
|
+
|:---|:---|:---|:---|
|
|
115
|
+
| **v1.0** | Core Resilient HTTP, Postgres Pool, Redacted Logger | Personal ETL scripts & Airflow workflows | **Released** |
|
|
116
|
+
| **v1.1** | Cursor-based Pagination (`CursorPagination`) | Handling APIs that use cursor-based pagination | *Planned* |
|
|
117
|
+
| **v1.2** | Async Client Support (`AsyncResilientClient`) | High-throughput concurrent API extraction pipelines | *Planned* |
|
|
118
|
+
| **v1.3** | Parquet / Arrow Ingestion Support | High-performance bulk column-based ingestion | *Planned* |
|
|
119
|
+
| **v2.0** | Schema Validation Layer (`pydantic` integration) | Ingestion payload sanitization and schema contracts | *Conceptual* |
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Install
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Core (HTTP + DB + logging)
|
|
127
|
+
pip install dehelpers
|
|
128
|
+
|
|
129
|
+
# With Pandas DataFrame support
|
|
130
|
+
pip install "dehelpers[dataframe]"
|
|
131
|
+
|
|
132
|
+
# Development (tests)
|
|
133
|
+
pip install "dehelpers[dev,dataframe]"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Requires Python ≥ 3.10.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Quickstart
|
|
141
|
+
|
|
142
|
+
### Resilient HTTP Client
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from dehelpers import ResilientClient, RetryPolicy
|
|
146
|
+
|
|
147
|
+
# Custom policy: up to 5 retries; opt in to retrying non-idempotent requests such as POST
|
|
148
|
+
policy = RetryPolicy(max_retries=5, retry_non_idempotent=True)
|
|
149
|
+
client = ResilientClient(retry_policy=policy)
|
|
150
|
+
|
|
151
|
+
resp = client.get("https://api.example.com/data")
|
|
152
|
+
print(resp.json())
|
|
153
|
+
|
|
154
|
+
# Paginate through all items
|
|
155
|
+
for item in client.paginate("https://api.example.com/items"):
|
|
156
|
+
process(item)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### PostgreSQL Database Helper
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from dehelpers import DatabaseManager
|
|
163
|
+
|
|
164
|
+
# Reads DATABASE_URL from environment by default
|
|
165
|
+
with DatabaseManager() as db:
|
|
166
|
+
rows = db.execute(
|
|
167
|
+
"SELECT * FROM users WHERE active = :active",
|
|
168
|
+
{"active": True},
|
|
169
|
+
)
|
|
170
|
+
print(f"Found {len(rows)} active users")
|
|
171
|
+
|
|
172
|
+
# Optional: load into a Pandas DataFrame
|
|
173
|
+
df = db.to_dataframe("SELECT * FROM sales WHERE date > :d", {"d": "2026-01-01"})
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Structured JSON Logger
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from dehelpers import get_logger, LogContext
|
|
180
|
+
|
|
181
|
+
log = get_logger("my_etl", job_id="daily-sales")
|
|
182
|
+
|
|
183
|
+
with LogContext(request_id="req-abc"):
|
|
184
|
+
log.info("Fetched data", extra={"row_count": 500})
|
|
185
|
+
# Output: {"timestamp": "...", "level": "INFO", "message": "Fetched data",
|
|
186
|
+
# "module": "...", "job_id": "daily-sales", "request_id": "req-abc",
|
|
187
|
+
# "row_count": 500, "error": null}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Configuration
|
|
193
|
+
|
|
194
|
+
| Parameter | Default | Description |
|
|
195
|
+
|-----------|---------|-------------|
|
|
196
|
+
| `DATABASE_URL` (env var) | — | PostgreSQL connection string (fallback when `dsn` is not passed) |
|
|
197
|
+
| `pool_size` | 5 | Persistent connections in the pool |
|
|
198
|
+
| `max_overflow` | 2 | Extra connections beyond pool_size |
|
|
199
|
+
| `pool_recycle` | 1800 | Seconds before connection recycling |
|
|
200
|
+
| `pool_pre_ping` | True | Health-check connections before use |
|
|
201
|
+
| `pool_timeout` | 30 | Seconds to wait for a pool connection |
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Security
|
|
206
|
+
|
|
207
|
+
### Automatic Redaction
|
|
208
|
+
|
|
209
|
+
The logger and API client automatically redact values for these keys in log output:
|
|
210
|
+
|
|
211
|
+
`password`, `secret`, `token`, `api_key`, `authorization`, `dsn`, `connection_string`, `credential`, `passphrase`, `private_key`, `client_secret`
|
|
212
|
+
|
|
213
|
+
Matching is **case-insensitive** and **substring-based** — e.g. the key `db_password` is redacted because it contains `password`.
|
|
214
|
+
|
|
215
|
+
You can extend the redaction list:
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from dehelpers._redact import redact_dict
|
|
219
|
+
|
|
220
|
+
result = redact_dict(
|
|
221
|
+
{"my_custom_secret": "value"},
|
|
222
|
+
extra_sensitive_keys=frozenset({"my_custom_secret"}),
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### ⚠️ Never Embed Secrets in URLs
|
|
227
|
+
|
|
228
|
+
URL query parameter values are redacted, but **path segments are not**. Never construct URLs like:
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
https://api.example.com/v1/token/abc123/data # BAD — token in path
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Instead, pass secrets via headers or request body.
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Fork Safety (Airflow / Multiprocessing)
|
|
239
|
+
|
|
240
|
+
If you use `DatabaseManager` in a forked environment (e.g. Airflow workers, `multiprocessing`), you **must** either:
|
|
241
|
+
|
|
242
|
+
1. Create the `DatabaseManager` **inside each worker process**, or
|
|
243
|
+
2. Call `db.dispose()` **before** forking.
|
|
244
|
+
|
|
245
|
+
SQLAlchemy connection pools are not safe to share across forked processes.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Testing
|
|
250
|
+
|
|
251
|
+
### Unit tests (no PostgreSQL required)
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
pip install -e ".[dev,dataframe]"
|
|
255
|
+
pytest -v --tb=short -m "not postgres"
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### PostgreSQL integration tests
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Start a local PostgreSQL
|
|
262
|
+
docker run -d --name pg-test -e POSTGRES_PASSWORD=test -p 5432:5432 postgres:16
|
|
263
|
+
|
|
264
|
+
# Run integration tests
|
|
265
|
+
DATABASE_URL="postgresql+psycopg://postgres:test@localhost:5432/postgres" \
|
|
266
|
+
pytest -m postgres -v
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### Coverage
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## License
|
|
278
|
+
|
|
279
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
# dehelpers
|
|
2
|
+
|
|
3
|
+
Lightweight, production-hardened Python utilities for data engineering pipelines.
|
|
4
|
+
|
|
5
|
+
**Resilient HTTP** · **PostgreSQL helpers** · **Structured JSON logging** — with automatic secret redaction, bounded retries, and safe connection pooling.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Architecture & Flow
|
|
10
|
+
|
|
11
|
+
```mermaid
|
|
12
|
+
graph TD
|
|
13
|
+
subgraph External [External APIs & Services]
|
|
14
|
+
REST_API[REST API Source]
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
subgraph DPH [dehelpers Package]
|
|
18
|
+
direction TB
|
|
19
|
+
subgraph Client [Resilient Client]
|
|
20
|
+
RC[ResilientClient] --> |Configured by| RP[RetryPolicy]
|
|
21
|
+
RC --> |Iterates with| NLP[NextLinkPagination]
|
|
22
|
+
RC --> |Sanitizes query| RU[redact_url]
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
subgraph Logger [Structured Logger]
|
|
26
|
+
GL[get_logger] --> |Formats record| JF[JSONFormatter]
|
|
27
|
+
LogCtx[LogContext] --> |Context injection| CV[job_id / request_id]
|
|
28
|
+
JF --> |Deep-redacts secrets| RD[redact_dict]
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
subgraph Database [Database Manager]
|
|
32
|
+
DBM[DatabaseManager] --> |Yields sessions| SC[_SessionContext]
|
|
33
|
+
DBM --> |Manages pool| SQLA[SQLAlchemy Engine]
|
|
34
|
+
DBM --> |Lazy Load| DF[Pandas DataFrame]
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
subgraph Target [Storage / Logs]
|
|
39
|
+
PG[(PostgreSQL DB)]
|
|
40
|
+
Stderr[Stderr / Cloud Logs]
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
REST_API ==> |Inbound Data| RC
|
|
44
|
+
RC --> |Yields items / logs events| GL
|
|
45
|
+
GL ==> |JSON Output| Stderr
|
|
46
|
+
RC --> |Normalized data| DBM
|
|
47
|
+
DBM ==> |Pool connections| PG
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Boundaries & Capabilities
|
|
53
|
+
|
|
54
|
+
Here is exactly what this package **is** and what it **is not**:
|
|
55
|
+
|
|
56
|
+
| Category / Layer | What this IS | What this IS NOT |
|
|
57
|
+
|:---|:---|:---|
|
|
58
|
+
| **API / HTTP** | A retry-protected wrapper around `requests.Session` with exponential backoff, jitter, and simple pagination. | An asynchronous network library (like `aiohttp` or `httpx`), a fully fledged HTTP client replacement, or a GraphQL API wrapper. |
|
|
59
|
+
| **Database** | A thread-safe connection manager for PostgreSQL with pooling configuration, automated transaction commits/rollbacks, and lazy DataFrame output. | An Object-Relational Mapper (ORM) (like SQLModel/SQLAlchemy ORM), schema migration engine (like Alembic), or database administration tool. |
|
|
60
|
+
| **Logging** | A structured JSON formatter built on the standard `logging` module, with no extra dependencies and automatic deep redaction of secrets. | A log routing system (like Fluentd/Logstash), file logger, metrics exporter, or complex log management server. |
|
|
61
|
+
| **Execution Context** | Designed for batch execution environments like Airflow tasks, ETL scripts, and containerized Docker runtimes. | Suitable for high-throughput, low-latency, real-time web servers or async microservices. |
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Comparison with Standard Setup
|
|
66
|
+
|
|
67
|
+
How this package compares to a standard DIY setup:
|
|
68
|
+
|
|
69
|
+
| Feature / Criteria | Standard Setup (`requests` + `logging` + `psycopg`) | `dehelpers` |
|
|
70
|
+
|:---|:---|:---|
|
|
71
|
+
| **Secret Leakage Protection** | Manual / None. Secrets easily print to stdout or appear in exception tracebacks. | **Automatic & Deep Recursive:** Redacts predefined secrets from nested metadata, logs, and query parameters. |
|
|
72
|
+
| **Retry & Jitter Strategy** | Manual loops or boilerplate `urllib3` retry configurations. | **Out-of-the-box resilience:** Exponential backoff with random jitter and clock-based `total_timeout` limit. |
|
|
73
|
+
| **Pagination Handling** | Custom pagination loop logic required for every API endpoint. | **Next-link strategy Protocol:** Yields individual items transparently and safely with validation. |
|
|
74
|
+
| **Connection Safety** | Connection leaks or failed transaction rollbacks when context managers (`with` blocks) are forgotten. | **Context-managed Session:** Engine-pooled with pre-ping checks, pool timeout, and auto-rollback. |
|
|
75
|
+
| **Dependency Footprint** | Heavy setup if installing frameworks like Loguru, Structlog, or heavy database utilities. | **Ultra-lightweight:** Base dependencies are minimal. Pandas is entirely optional and lazy-loaded. |
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Roadmap & What's Next
|
|
80
|
+
|
|
81
|
+
| Phase | Feature / Expansion | Target Use Case | Status |
|
|
82
|
+
|:---|:---|:---|:---|
|
|
83
|
+
| **v1.0** | Core Resilient HTTP, Postgres Pool, Redacted Logger | Personal ETL scripts & Airflow workflows | **Released** |
|
|
84
|
+
| **v1.1** | Cursor-based Pagination (`CursorPagination`) | Handling APIs that use cursor-based pagination | *Planned* |
|
|
85
|
+
| **v1.2** | Async Client Support (`AsyncResilientClient`) | High-throughput concurrent API extraction pipelines | *Planned* |
|
|
86
|
+
| **v1.3** | Parquet / Arrow Ingestion Support | High-performance bulk column-based ingestion | *Planned* |
|
|
87
|
+
| **v2.0** | Schema Validation Layer (`pydantic` integration) | Ingestion payload sanitization and schema contracts | *Conceptual* |
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Install
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Core (HTTP + DB + logging)
|
|
95
|
+
pip install dehelpers
|
|
96
|
+
|
|
97
|
+
# With Pandas DataFrame support
|
|
98
|
+
pip install "dehelpers[dataframe]"
|
|
99
|
+
|
|
100
|
+
# Development (tests)
|
|
101
|
+
pip install "dehelpers[dev,dataframe]"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Requires Python ≥ 3.10.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Quickstart
|
|
109
|
+
|
|
110
|
+
### Resilient HTTP Client
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from dehelpers import ResilientClient, RetryPolicy
|
|
114
|
+
|
|
115
|
+
# Custom policy: up to 5 retries; opt in to retrying non-idempotent requests such as POST
|
|
116
|
+
policy = RetryPolicy(max_retries=5, retry_non_idempotent=True)
|
|
117
|
+
client = ResilientClient(retry_policy=policy)
|
|
118
|
+
|
|
119
|
+
resp = client.get("https://api.example.com/data")
|
|
120
|
+
print(resp.json())
|
|
121
|
+
|
|
122
|
+
# Paginate through all items
|
|
123
|
+
for item in client.paginate("https://api.example.com/items"):
|
|
124
|
+
process(item)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### PostgreSQL Database Helper
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from dehelpers import DatabaseManager
|
|
131
|
+
|
|
132
|
+
# Reads DATABASE_URL from environment by default
|
|
133
|
+
with DatabaseManager() as db:
|
|
134
|
+
rows = db.execute(
|
|
135
|
+
"SELECT * FROM users WHERE active = :active",
|
|
136
|
+
{"active": True},
|
|
137
|
+
)
|
|
138
|
+
print(f"Found {len(rows)} active users")
|
|
139
|
+
|
|
140
|
+
# Optional: load into a Pandas DataFrame
|
|
141
|
+
df = db.to_dataframe("SELECT * FROM sales WHERE date > :d", {"d": "2026-01-01"})
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Structured JSON Logger
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from dehelpers import get_logger, LogContext
|
|
148
|
+
|
|
149
|
+
log = get_logger("my_etl", job_id="daily-sales")
|
|
150
|
+
|
|
151
|
+
with LogContext(request_id="req-abc"):
|
|
152
|
+
log.info("Fetched data", extra={"row_count": 500})
|
|
153
|
+
# Output: {"timestamp": "...", "level": "INFO", "message": "Fetched data",
|
|
154
|
+
# "module": "...", "job_id": "daily-sales", "request_id": "req-abc",
|
|
155
|
+
# "row_count": 500, "error": null}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Configuration
|
|
161
|
+
|
|
162
|
+
| Parameter | Default | Description |
|
|
163
|
+
|-----------|---------|-------------|
|
|
164
|
+
| `DATABASE_URL` (env var) | — | PostgreSQL connection string (fallback when `dsn` is not passed) |
|
|
165
|
+
| `pool_size` | 5 | Persistent connections in the pool |
|
|
166
|
+
| `max_overflow` | 2 | Extra connections beyond pool_size |
|
|
167
|
+
| `pool_recycle` | 1800 | Seconds before connection recycling |
|
|
168
|
+
| `pool_pre_ping` | True | Health-check connections before use |
|
|
169
|
+
| `pool_timeout` | 30 | Seconds to wait for a pool connection |
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Security
|
|
174
|
+
|
|
175
|
+
### Automatic Redaction
|
|
176
|
+
|
|
177
|
+
The logger and API client automatically redact values for these keys in log output:
|
|
178
|
+
|
|
179
|
+
`password`, `secret`, `token`, `api_key`, `authorization`, `dsn`, `connection_string`, `credential`, `passphrase`, `private_key`, `client_secret`
|
|
180
|
+
|
|
181
|
+
Matching is **case-insensitive** and **substring-based** — e.g. the key `db_password` is redacted because it contains `password`.
|
|
182
|
+
|
|
183
|
+
You can extend the redaction list:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from dehelpers._redact import redact_dict
|
|
187
|
+
|
|
188
|
+
result = redact_dict(
|
|
189
|
+
{"my_custom_secret": "value"},
|
|
190
|
+
extra_sensitive_keys=frozenset({"my_custom_secret"}),
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### ⚠️ Never Embed Secrets in URLs
|
|
195
|
+
|
|
196
|
+
URL query parameter values are redacted, but **path segments are not**. Never construct URLs like:
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
https://api.example.com/v1/token/abc123/data # BAD — token in path
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Instead, pass secrets via headers or request body.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Fork Safety (Airflow / Multiprocessing)
|
|
207
|
+
|
|
208
|
+
If you use `DatabaseManager` in a forked environment (e.g. Airflow workers, `multiprocessing`), you **must** either:
|
|
209
|
+
|
|
210
|
+
1. Create the `DatabaseManager` **inside each worker process**, or
|
|
211
|
+
2. Call `db.dispose()` **before** forking.
|
|
212
|
+
|
|
213
|
+
SQLAlchemy connection pools are not safe to share across forked processes.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Testing
|
|
218
|
+
|
|
219
|
+
### Unit tests (no PostgreSQL required)
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
pip install -e ".[dev,dataframe]"
|
|
223
|
+
pytest -v --tb=short -m "not postgres"
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### PostgreSQL integration tests
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
# Start a local PostgreSQL
|
|
230
|
+
docker run -d --name pg-test -e POSTGRES_PASSWORD=test -p 5432:5432 postgres:16
|
|
231
|
+
|
|
232
|
+
# Run integration tests
|
|
233
|
+
DATABASE_URL="postgresql+psycopg://postgres:test@localhost:5432/postgres" \
|
|
234
|
+
pytest -m postgres -v
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Coverage
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
MIT — see [LICENSE](LICENSE).
|