dehelpers 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
# Continuous integration: installs the package with its "dev" and "dataframe"
# extras and runs the test suite (excluding tests marked "postgres") on each
# Python version listed in the matrix.
name: Python CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# Least privilege: this workflow only needs to read the repository contents.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      # Let every interpreter finish so one failure does not hide the others.
      fail-fast: false
      matrix:
        # Versions are quoted so "3.10" is not parsed as the float 3.1.
        python-version: ["3.10", "3.11", "3.12", "3.13"]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          # Cache pip downloads, keyed on the dependency declarations.
          cache: 'pip'
          cache-dependency-path: pyproject.toml

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e ".[dev,dataframe]"

      - name: Run tests with coverage
        run: |
          pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
@@ -0,0 +1,35 @@
# Release pipeline: when a GitHub release is published, build the
# distributions with `python -m build` and upload them to PyPI through
# OIDC Trusted Publishing.
name: Publish to PyPI

on:
  release:
    types: [published]

# Declaring any permission sets every unlisted scope to "none", so default to
# read-only here and grant the OIDC token only to the job that publishes.
permissions:
  contents: read

jobs:
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/dehelpers
    permissions:
      contents: read  # Required by actions/checkout
      id-token: write  # Required for PyPI OIDC Trusted Publishing
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install build

      - name: Build package
        run: python -m build

      # No username/password inputs are passed; authentication relies on the
      # id-token permission granted above.
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,47 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # Pytest / Coverage
26
+ .pytest_cache/
27
+ .coverage
28
+ htmlcov/
29
+ .cov/
30
+ .nox/
31
+ .tox/
32
+
33
+ # Environments
34
+ .env
35
+ .venv
36
+ env/
37
+ venv/
38
+ ENV/
39
+ env.bak/
40
+ venv.bak/
41
+
42
+ # IDEs
43
+ .vscode/
44
+ .idea/
45
+ *.swp
46
+ *.swo
47
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shardul Chogale
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,279 @@
1
+ Metadata-Version: 2.4
2
+ Name: dehelpers
3
+ Version: 0.1.0
4
+ Summary: Lightweight utilities for data engineering pipelines: resilient HTTP, PostgreSQL helpers, and structured logging.
5
+ Project-URL: Homepage, https://github.com/shard-c6/dehelpers
6
+ Project-URL: Repository, https://github.com/shard-c6/dehelpers
7
+ Author: Shardul Chogale
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Database
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: psycopg[binary]>=3.0
22
+ Requires-Dist: requests>=2.28
23
+ Requires-Dist: sqlalchemy>=2.0
24
+ Provides-Extra: dataframe
25
+ Requires-Dist: pandas>=2.0; extra == 'dataframe'
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest-cov; extra == 'dev'
28
+ Requires-Dist: pytest-postgresql; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0; extra == 'dev'
30
+ Requires-Dist: responses>=0.23; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # dehelpers
34
+
35
+ Lightweight, production-hardened Python utilities for data engineering pipelines.
36
+
37
+ **Resilient HTTP** · **PostgreSQL helpers** · **Structured JSON logging** — with automatic secret redaction, bounded retries, and safe connection pooling.
38
+
39
+ ---
40
+
41
+ ## Architecture & Flow
42
+
43
+ ```mermaid
44
+ graph TD
45
+ subgraph External [External APIs & Services]
46
+ REST_API[REST API Source]
47
+ end
48
+
49
+ subgraph DPH [dehelpers Package]
50
+ direction TB
51
+ subgraph Client [Resilient Client]
52
+ RC[ResilientClient] --> |Configured by| RP[RetryPolicy]
53
+ RC --> |Iterates with| NLP[NextLinkPagination]
54
+ RC --> |Sanitizes query| RU[redact_url]
55
+ end
56
+
57
+ subgraph Logger [Structured Logger]
58
+ GL[get_logger] --> |Formats record| JF[JSONFormatter]
59
+ LogCtx[LogContext] --> |Context injection| CV[job_id / request_id]
60
+ JF --> |Deep-redacts secrets| RD[redact_dict]
61
+ end
62
+
63
+ subgraph Database [Database Manager]
64
+ DBM[DatabaseManager] --> |Yields sessions| SC[_SessionContext]
65
+ DBM --> |Manages pool| SQLA[SQLAlchemy Engine]
66
+ DBM --> |Lazy Load| DF[Pandas DataFrame]
67
+ end
68
+ end
69
+
70
+ subgraph Target [Storage / Logs]
71
+ PG[(PostgreSQL DB)]
72
+ Stderr[Stderr / Cloud Logs]
73
+ end
74
+
75
+ REST_API ==> |Inbound Data| RC
76
+ RC --> |Yields items / logs events| GL
77
+ GL ==> |JSON Output| Stderr
78
+ RC --> |Normalized data| DBM
79
+ DBM ==> |Pool connections| PG
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Boundaries & Capabilities
85
+
86
+ Here is exactly what this package **is** and what it **is not**:
87
+
88
+ | Category / Layer | What this IS | What this IS NOT |
89
+ |:---|:---|:---|
90
+ | **API / HTTP** | A retry-protected wrapper around `requests.Session` with exponential backoff, jitter, and simple pagination. | An asynchronous network library (like `aiohttp` or `httpx`), fully-fledged HTTP client replacement, or GraphQL API wrapper. |
91
+ | **Database** | A thread-safe connection manager for PostgreSQL with pooling configuration, automated transaction commits/rollbacks, and lazy DataFrame output. | An Object-Relational Mapper (ORM) (like SQLModel/SQLAlchemy ORM), schema migration engine (like Alembic), or database administration tool. |
92
+ | **Logging** | A zero-dependency structured JSON formatter on top of standard `logging` with automatic deep secrets redaction. | A log routing system (like Fluentd/Logstash), file logger, metrics exporter, or complex log management server. |
93
+ | **Execution Context** | Designed for batch execution environments like Airflow tasks, ETL scripts, and containerized Docker runtimes. | Suitable for high-throughput, low-latency, real-time web servers or async microservices. |
94
+
95
+ ---
96
+
97
+ ## Comparison with Standard Setup
98
+
99
+ How this package compares to a standard DIY setup:
100
+
101
+ | Feature / Criteria | Standard Setup (`requests` + `logging` + `psycopg`) | `dehelpers` |
102
+ |:---|:---|:---|
103
+ | **Secret Leakage Protection** | Manual / None. Secrets easily print to stdout or appear in exception tracebacks. | **Automatic & Deep Recursive:** Redacts predefined secrets from nested metadata, logs, and query parameters. |
104
+ | **Retry & Jitter Strategy** | Manual loops or boilerplate `urllib3` retry configurations. | **Out-of-the-box resilience:** Exponential backoff with random jitter and clock-based `total_timeout` limit. |
105
+ | **Pagination Handling** | Custom pagination loop logic required for every API endpoint. | **Next-link strategy Protocol:** Yields individual items transparently and safely with validation. |
106
+ | **Connection Safety** | Connection leaks or transaction rollback failures if context managers are missed. | **Context-managed Session:** Engine-pooled with pre-ping checks, pool timeout, and auto-rollback. |
107
+ | **Dependency Footprint** | Heavy setup if installing frameworks like Loguru, Structlog, or heavy database utilities. | **Ultra-lightweight:** Base dependencies are minimal. Pandas is entirely optional and lazy-loaded. |
108
+
109
+ ---
110
+
111
+ ## Roadmap & What's Next
112
+
113
+ | Phase | Feature / Expansion | Target Use Case | Status |
114
+ |:---|:---|:---|:---|
115
+ | **v0.1** | Core Resilient HTTP, Postgres Pool, Redacted Logger | Personal ETL scripts & Airflow workflows | **Released** |
116
+ | **v1.1** | Cursor-based Pagination (`CursorPagination`) | Handling APIs that paginate with opaque cursors | *Planned* |
117
+ | **v1.2** | Async Client Support (`AsyncResilientClient`) | High-throughput concurrent API extraction pipelines | *Planned* |
118
+ | **v1.3** | Parquet / Arrow Ingestion Support | High-performance bulk column-based ingestion | *Planned* |
119
+ | **v2.0** | Schema Validation Layer (`pydantic` integration) | Ingestion payload sanitization and schema contracts | *Conceptual* |
120
+
121
+ ---
122
+
123
+ ## Install
124
+
125
+ ```bash
126
+ # Core (HTTP + DB + logging)
127
+ pip install dehelpers
128
+
129
+ # With Pandas DataFrame support
130
+ pip install "dehelpers[dataframe]"
131
+
132
+ # Development (tests)
133
+ pip install "dehelpers[dev,dataframe]"
134
+ ```
135
+
136
+ Requires Python ≥ 3.10.
137
+
138
+ ---
139
+
140
+ ## Quickstart
141
+
142
+ ### Resilient HTTP Client
143
+
144
+ ```python
145
+ from dehelpers import ResilientClient, RetryPolicy
146
+
147
+ # Custom policy: 5 retries, retry POST with opt-in
148
+ policy = RetryPolicy(max_retries=5, retry_non_idempotent=True)
149
+ client = ResilientClient(retry_policy=policy)
150
+
151
+ resp = client.get("https://api.example.com/data")
152
+ print(resp.json())
153
+
154
+ # Paginate through all items
155
+ for item in client.paginate("https://api.example.com/items"):
156
+ process(item)
157
+ ```
158
+
159
+ ### PostgreSQL Database Helper
160
+
161
+ ```python
162
+ from dehelpers import DatabaseManager
163
+
164
+ # Reads DATABASE_URL from environment by default
165
+ with DatabaseManager() as db:
166
+ rows = db.execute(
167
+ "SELECT * FROM users WHERE active = :active",
168
+ {"active": True},
169
+ )
170
+ print(f"Found {len(rows)} active users")
171
+
172
+ # Optional: load into a Pandas DataFrame
173
+ df = db.to_dataframe("SELECT * FROM sales WHERE date > :d", {"d": "2026-01-01"})
174
+ ```
175
+
176
+ ### Structured JSON Logger
177
+
178
+ ```python
179
+ from dehelpers import get_logger, LogContext
180
+
181
+ log = get_logger("my_etl", job_id="daily-sales")
182
+
183
+ with LogContext(request_id="req-abc"):
184
+ log.info("Fetched data", extra={"row_count": 500})
185
+ # Output: {"timestamp": "...", "level": "INFO", "message": "Fetched data",
186
+ # "module": "...", "job_id": "daily-sales", "request_id": "req-abc",
187
+ # "row_count": 500, "error": null}
188
+ ```
189
+
190
+ ---
191
+
192
+ ## Configuration
193
+
194
+ | Parameter | Default | Description |
195
+ |-----------|---------|-------------|
196
+ | `DATABASE_URL` (env var) | — | PostgreSQL connection string (fallback when `dsn` is not passed) |
197
+ | `pool_size` | 5 | Persistent connections in the pool |
198
+ | `max_overflow` | 2 | Extra connections beyond pool_size |
199
+ | `pool_recycle` | 1800 | Seconds before connection recycling |
200
+ | `pool_pre_ping` | True | Health-check connections before use |
201
+ | `pool_timeout` | 30 | Seconds to wait for a pool connection |
202
+
203
+ ---
204
+
205
+ ## Security
206
+
207
+ ### Automatic Redaction
208
+
209
+ The logger and API client automatically redact values for these keys in log output:
210
+
211
+ `password`, `secret`, `token`, `api_key`, `authorization`, `dsn`, `connection_string`, `credential`, `passphrase`, `private_key`, `client_secret`
212
+
213
+ Matching is **case-insensitive substring** — e.g. `db_password` matches `password`.
214
+
215
+ You can extend the redaction list:
216
+
217
+ ```python
218
+ from dehelpers._redact import redact_dict
219
+
220
+ result = redact_dict(
221
+ {"my_custom_secret": "value"},
222
+ extra_sensitive_keys=frozenset({"my_custom_secret"}),
223
+ )
224
+ ```
225
+
226
+ ### ⚠️ Never Embed Secrets in URLs
227
+
228
+ URL query parameter values are redacted, but **path segments are not**. Never construct URLs like:
229
+
230
+ ```
231
+ https://api.example.com/v1/token/abc123/data # BAD — token in path
232
+ ```
233
+
234
+ Instead, pass secrets via headers or request body.
235
+
236
+ ---
237
+
238
+ ## Fork Safety (Airflow / Multiprocessing)
239
+
240
+ If you use `DatabaseManager` in a forked environment (e.g. Airflow workers, `multiprocessing`), you **must** either:
241
+
242
+ 1. Create the `DatabaseManager` **inside each worker process**, or
243
+ 2. Call `db.dispose()` **before** forking.
244
+
245
+ SQLAlchemy connection pools are not safe to share across forked processes.
246
+
247
+ ---
248
+
249
+ ## Testing
250
+
251
+ ### Unit tests (no PostgreSQL required)
252
+
253
+ ```bash
254
+ pip install -e ".[dev,dataframe]"
255
+ pytest -v --tb=short -m "not postgres"
256
+ ```
257
+
258
+ ### PostgreSQL integration tests
259
+
260
+ ```bash
261
+ # Start a local PostgreSQL
262
+ docker run -d --name pg-test -e POSTGRES_PASSWORD=test -p 5432:5432 postgres:16
263
+
264
+ # Run integration tests
265
+ DATABASE_URL="postgresql+psycopg://postgres:test@localhost:5432/postgres" \
266
+ pytest -m postgres -v
267
+ ```
268
+
269
+ ### Coverage
270
+
271
+ ```bash
272
+ pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
273
+ ```
274
+
275
+ ---
276
+
277
+ ## License
278
+
279
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,247 @@
1
+ # dehelpers
2
+
3
+ Lightweight, production-hardened Python utilities for data engineering pipelines.
4
+
5
+ **Resilient HTTP** · **PostgreSQL helpers** · **Structured JSON logging** — with automatic secret redaction, bounded retries, and safe connection pooling.
6
+
7
+ ---
8
+
9
+ ## Architecture & Flow
10
+
11
+ ```mermaid
12
+ graph TD
13
+ subgraph External [External APIs & Services]
14
+ REST_API[REST API Source]
15
+ end
16
+
17
+ subgraph DPH [dehelpers Package]
18
+ direction TB
19
+ subgraph Client [Resilient Client]
20
+ RC[ResilientClient] --> |Configured by| RP[RetryPolicy]
21
+ RC --> |Iterates with| NLP[NextLinkPagination]
22
+ RC --> |Sanitizes query| RU[redact_url]
23
+ end
24
+
25
+ subgraph Logger [Structured Logger]
26
+ GL[get_logger] --> |Formats record| JF[JSONFormatter]
27
+ LogCtx[LogContext] --> |Context injection| CV[job_id / request_id]
28
+ JF --> |Deep-redacts secrets| RD[redact_dict]
29
+ end
30
+
31
+ subgraph Database [Database Manager]
32
+ DBM[DatabaseManager] --> |Yields sessions| SC[_SessionContext]
33
+ DBM --> |Manages pool| SQLA[SQLAlchemy Engine]
34
+ DBM --> |Lazy Load| DF[Pandas DataFrame]
35
+ end
36
+ end
37
+
38
+ subgraph Target [Storage / Logs]
39
+ PG[(PostgreSQL DB)]
40
+ Stderr[Stderr / Cloud Logs]
41
+ end
42
+
43
+ REST_API ==> |Inbound Data| RC
44
+ RC --> |Yields items / logs events| GL
45
+ GL ==> |JSON Output| Stderr
46
+ RC --> |Normalized data| DBM
47
+ DBM ==> |Pool connections| PG
48
+ ```
49
+
50
+ ---
51
+
52
+ ## Boundaries & Capabilities
53
+
54
+ Here is exactly what this package **is** and what it **is not**:
55
+
56
+ | Category / Layer | What this IS | What this IS NOT |
57
+ |:---|:---|:---|
58
+ | **API / HTTP** | A retry-protected wrapper around `requests.Session` with exponential backoff, jitter, and simple pagination. | An asynchronous network library (like `aiohttp` or `httpx`), fully-fledged HTTP client replacement, or GraphQL API wrapper. |
59
+ | **Database** | A thread-safe connection manager for PostgreSQL with pooling configuration, automated transaction commits/rollbacks, and lazy DataFrame output. | An Object-Relational Mapper (ORM) (like SQLModel/SQLAlchemy ORM), schema migration engine (like Alembic), or database administration tool. |
60
+ | **Logging** | A zero-dependency structured JSON formatter on top of standard `logging` with automatic deep secrets redaction. | A log routing system (like Fluentd/Logstash), file logger, metrics exporter, or complex log management server. |
61
+ | **Execution Context** | Designed for batch execution environments like Airflow tasks, ETL scripts, and containerized Docker runtimes. | Suitable for high-throughput, low-latency, real-time web servers or async microservices. |
62
+
63
+ ---
64
+
65
+ ## Comparison with Standard Setup
66
+
67
+ How this package compares to a standard DIY setup:
68
+
69
+ | Feature / Criteria | Standard Setup (`requests` + `logging` + `psycopg`) | `dehelpers` |
70
+ |:---|:---|:---|
71
+ | **Secret Leakage Protection** | Manual / None. Secrets easily print to stdout or appear in exception tracebacks. | **Automatic & Deep Recursive:** Redacts predefined secrets from nested metadata, logs, and query parameters. |
72
+ | **Retry & Jitter Strategy** | Manual loops or boilerplate `urllib3` retry configurations. | **Out-of-the-box resilience:** Exponential backoff with random jitter and clock-based `total_timeout` limit. |
73
+ | **Pagination Handling** | Custom pagination loop logic required for every API endpoint. | **Next-link strategy Protocol:** Yields individual items transparently and safely with validation. |
74
+ | **Connection Safety** | Connection leaks or transaction rollback failures if context managers are missed. | **Context-managed Session:** Engine-pooled with pre-ping checks, pool timeout, and auto-rollback. |
75
+ | **Dependency Footprint** | Heavy setup if installing frameworks like Loguru, Structlog, or heavy database utilities. | **Ultra-lightweight:** Base dependencies are minimal. Pandas is entirely optional and lazy-loaded. |
76
+
77
+ ---
78
+
79
+ ## Roadmap & What's Next
80
+
81
+ | Phase | Feature / Expansion | Target Use Case | Status |
82
+ |:---|:---|:---|:---|
83
+ | **v0.1** | Core Resilient HTTP, Postgres Pool, Redacted Logger | Personal ETL scripts & Airflow workflows | **Released** |
84
+ | **v1.1** | Cursor-based Pagination (`CursorPagination`) | Handling APIs that paginate with opaque cursors | *Planned* |
85
+ | **v1.2** | Async Client Support (`AsyncResilientClient`) | High-throughput concurrent API extraction pipelines | *Planned* |
86
+ | **v1.3** | Parquet / Arrow Ingestion Support | High-performance bulk column-based ingestion | *Planned* |
87
+ | **v2.0** | Schema Validation Layer (`pydantic` integration) | Ingestion payload sanitization and schema contracts | *Conceptual* |
88
+
89
+ ---
90
+
91
+ ## Install
92
+
93
+ ```bash
94
+ # Core (HTTP + DB + logging)
95
+ pip install dehelpers
96
+
97
+ # With Pandas DataFrame support
98
+ pip install "dehelpers[dataframe]"
99
+
100
+ # Development (tests)
101
+ pip install "dehelpers[dev,dataframe]"
102
+ ```
103
+
104
+ Requires Python ≥ 3.10.
105
+
106
+ ---
107
+
108
+ ## Quickstart
109
+
110
+ ### Resilient HTTP Client
111
+
112
+ ```python
113
+ from dehelpers import ResilientClient, RetryPolicy
114
+
115
+ # Custom policy: 5 retries, retry POST with opt-in
116
+ policy = RetryPolicy(max_retries=5, retry_non_idempotent=True)
117
+ client = ResilientClient(retry_policy=policy)
118
+
119
+ resp = client.get("https://api.example.com/data")
120
+ print(resp.json())
121
+
122
+ # Paginate through all items
123
+ for item in client.paginate("https://api.example.com/items"):
124
+ process(item)
125
+ ```
126
+
127
+ ### PostgreSQL Database Helper
128
+
129
+ ```python
130
+ from dehelpers import DatabaseManager
131
+
132
+ # Reads DATABASE_URL from environment by default
133
+ with DatabaseManager() as db:
134
+ rows = db.execute(
135
+ "SELECT * FROM users WHERE active = :active",
136
+ {"active": True},
137
+ )
138
+ print(f"Found {len(rows)} active users")
139
+
140
+ # Optional: load into a Pandas DataFrame
141
+ df = db.to_dataframe("SELECT * FROM sales WHERE date > :d", {"d": "2026-01-01"})
142
+ ```
143
+
144
+ ### Structured JSON Logger
145
+
146
+ ```python
147
+ from dehelpers import get_logger, LogContext
148
+
149
+ log = get_logger("my_etl", job_id="daily-sales")
150
+
151
+ with LogContext(request_id="req-abc"):
152
+ log.info("Fetched data", extra={"row_count": 500})
153
+ # Output: {"timestamp": "...", "level": "INFO", "message": "Fetched data",
154
+ # "module": "...", "job_id": "daily-sales", "request_id": "req-abc",
155
+ # "row_count": 500, "error": null}
156
+ ```
157
+
158
+ ---
159
+
160
+ ## Configuration
161
+
162
+ | Parameter | Default | Description |
163
+ |-----------|---------|-------------|
164
+ | `DATABASE_URL` (env var) | — | PostgreSQL connection string (fallback when `dsn` is not passed) |
165
+ | `pool_size` | 5 | Persistent connections in the pool |
166
+ | `max_overflow` | 2 | Extra connections beyond pool_size |
167
+ | `pool_recycle` | 1800 | Seconds before connection recycling |
168
+ | `pool_pre_ping` | True | Health-check connections before use |
169
+ | `pool_timeout` | 30 | Seconds to wait for a pool connection |
170
+
171
+ ---
172
+
173
+ ## Security
174
+
175
+ ### Automatic Redaction
176
+
177
+ The logger and API client automatically redact values for these keys in log output:
178
+
179
+ `password`, `secret`, `token`, `api_key`, `authorization`, `dsn`, `connection_string`, `credential`, `passphrase`, `private_key`, `client_secret`
180
+
181
+ Matching is **case-insensitive substring** — e.g. `db_password` matches `password`.
182
+
183
+ You can extend the redaction list:
184
+
185
+ ```python
186
+ from dehelpers._redact import redact_dict
187
+
188
+ result = redact_dict(
189
+ {"my_custom_secret": "value"},
190
+ extra_sensitive_keys=frozenset({"my_custom_secret"}),
191
+ )
192
+ ```
193
+
194
+ ### ⚠️ Never Embed Secrets in URLs
195
+
196
+ URL query parameter values are redacted, but **path segments are not**. Never construct URLs like:
197
+
198
+ ```
199
+ https://api.example.com/v1/token/abc123/data # BAD — token in path
200
+ ```
201
+
202
+ Instead, pass secrets via headers or request body.
203
+
204
+ ---
205
+
206
+ ## Fork Safety (Airflow / Multiprocessing)
207
+
208
+ If you use `DatabaseManager` in a forked environment (e.g. Airflow workers, `multiprocessing`), you **must** either:
209
+
210
+ 1. Create the `DatabaseManager` **inside each worker process**, or
211
+ 2. Call `db.dispose()` **before** forking.
212
+
213
+ SQLAlchemy connection pools are not safe to share across forked processes.
214
+
215
+ ---
216
+
217
+ ## Testing
218
+
219
+ ### Unit tests (no PostgreSQL required)
220
+
221
+ ```bash
222
+ pip install -e ".[dev,dataframe]"
223
+ pytest -v --tb=short -m "not postgres"
224
+ ```
225
+
226
+ ### PostgreSQL integration tests
227
+
228
+ ```bash
229
+ # Start a local PostgreSQL
230
+ docker run -d --name pg-test -e POSTGRES_PASSWORD=test -p 5432:5432 postgres:16
231
+
232
+ # Run integration tests
233
+ DATABASE_URL="postgresql+psycopg://postgres:test@localhost:5432/postgres" \
234
+ pytest -m postgres -v
235
+ ```
236
+
237
+ ### Coverage
238
+
239
+ ```bash
240
+ pytest --cov=dehelpers --cov-report=term-missing -m "not postgres"
241
+ ```
242
+
243
+ ---
244
+
245
+ ## License
246
+
247
+ MIT — see [LICENSE](LICENSE).