airflow-toolkit 2.2.0__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airflow_toolkit-2.2.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.4.0}/PKG-INFO +193 -7
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/README.md +186 -6
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/pyproject.toml +9 -1
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/compression_utils.py +2 -2
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +90 -6
- airflow_toolkit-2.4.0/src/airflow_toolkit/providers/filesystem/operators/auth.py +66 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +154 -82
- airflow_toolkit-2.4.0/src/airflow_toolkit/testing.py +59 -0
- airflow_toolkit-2.4.0/src/airflow_toolkit/types.py +51 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0/src/airflow_toolkit.egg-info}/PKG-INFO +193 -7
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/SOURCES.txt +3 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/requires.txt +8 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/LICENSE.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/setup.cfg +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/exceptions.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/context.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/protocols.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/package.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/py.typed +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: airflow-toolkit
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
|
|
5
5
|
Author-email: Biel Llobera <biel_llobera@dkl.digital>
|
|
6
6
|
Requires-Python: <3.15,>=3.11
|
|
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
|
|
|
32
32
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
|
|
33
33
|
Provides-Extra: sqlite
|
|
34
34
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
|
|
35
|
+
Provides-Extra: excel
|
|
36
|
+
Requires-Dist: openpyxl>=3.1; extra == "excel"
|
|
37
|
+
Provides-Extra: avro
|
|
38
|
+
Requires-Dist: fastavro>=1.9; extra == "avro"
|
|
35
39
|
Provides-Extra: airflow3-full
|
|
36
40
|
Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
|
|
37
41
|
Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
|
|
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
|
|
|
49
53
|
Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
|
|
50
54
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
|
|
51
55
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
|
|
56
|
+
Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
|
|
57
|
+
Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
|
|
52
58
|
Dynamic: license-file
|
|
53
59
|
|
|
54
60
|
# Airflow Toolkit
|
|
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
136
142
|
| `google` | `providers-google` | GCS filesystem backend |
|
|
137
143
|
| `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
|
|
138
144
|
| `sftp` | `providers-sftp` | SFTP filesystem backend |
|
|
139
|
-
| `slack` | `providers-slack` | Slack failure notifications |
|
|
140
145
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
141
146
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
142
147
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
148
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
149
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
143
150
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
144
151
|
|
|
145
152
|
---
|
|
@@ -184,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
|
|
|
184
191
|
|
|
185
192
|
### HttpToFilesystem
|
|
186
193
|
|
|
187
|
-
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
|
|
194
|
+
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
|
|
188
195
|
|
|
189
196
|
```python
|
|
190
197
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
|
|
@@ -201,7 +208,7 @@ HttpToFilesystem(
|
|
|
201
208
|
)
|
|
202
209
|
```
|
|
203
210
|
|
|
204
|
-
With cursor-based pagination
|
|
211
|
+
**With cursor-based pagination:**
|
|
205
212
|
|
|
206
213
|
```python
|
|
207
214
|
def next_page(response):
|
|
@@ -223,9 +230,70 @@ HttpToFilesystem(
|
|
|
223
230
|
)
|
|
224
231
|
```
|
|
225
232
|
|
|
233
|
+
**With OAuth 2.0 Client Credentials:**
|
|
234
|
+
|
|
235
|
+
`OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
|
|
239
|
+
|
|
240
|
+
HttpToFilesystem(
|
|
241
|
+
task_id='fetch_protected_data',
|
|
242
|
+
http_conn_id='my_api',
|
|
243
|
+
filesystem_conn_id='my_data_lake',
|
|
244
|
+
filesystem_path='raw/data/{{ ds }}/',
|
|
245
|
+
endpoint='/api/v1/data',
|
|
246
|
+
method='GET',
|
|
247
|
+
save_format='jsonl',
|
|
248
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
249
|
+
token_url='https://auth.example.com/oauth2/token',
|
|
250
|
+
client_id='{{ var.value.oauth2_client_id }}',
|
|
251
|
+
client_secret='{{ var.value.oauth2_client_secret }}',
|
|
252
|
+
scope='read', # optional
|
|
253
|
+
),
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**With rate limiting:**
|
|
258
|
+
|
|
259
|
+
Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
HttpToFilesystem(
|
|
263
|
+
task_id='fetch_with_rate_limit',
|
|
264
|
+
http_conn_id='my_api',
|
|
265
|
+
filesystem_conn_id='my_data_lake',
|
|
266
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
267
|
+
endpoint='/api/v1/events',
|
|
268
|
+
method='GET',
|
|
269
|
+
pagination_function=next_page,
|
|
270
|
+
save_format='jsonl',
|
|
271
|
+
requests_per_second=3.0, # max 3 requests per second between pages
|
|
272
|
+
)
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
**Supported response formats:**
|
|
276
|
+
|
|
277
|
+
`save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
|
|
278
|
+
|
|
279
|
+
| `source_format` / `save_format` | File extension | Notes |
|
|
280
|
+
|---|---|---|
|
|
281
|
+
| `json` | `.json` | Single JSON object or array |
|
|
282
|
+
| `jsonl` | `.jsonl` | Array response written as one record per line |
|
|
283
|
+
| `csv` | `.csv` | Raw CSV text from the response |
|
|
284
|
+
| `xml` | `.xml` | Raw XML text from the response |
|
|
285
|
+
| `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
|
|
286
|
+
| `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
|
|
287
|
+
| `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
|
|
288
|
+
| `fixed_width` | `.fwf` | Fixed-width text from the response |
|
|
289
|
+
|
|
290
|
+
All text and JSON formats support gzip/zip compression via the `compression` parameter.
|
|
291
|
+
|
|
226
292
|
### MultiHttpToFilesystem
|
|
227
293
|
|
|
228
|
-
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file.
|
|
294
|
+
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
|
|
295
|
+
|
|
296
|
+
**Sequential with rate limiting:**
|
|
229
297
|
|
|
230
298
|
```python
|
|
231
299
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
|
|
@@ -237,6 +305,7 @@ MultiHttpToFilesystem(
|
|
|
237
305
|
filesystem_path='raw/reference/{{ ds }}/',
|
|
238
306
|
method='GET',
|
|
239
307
|
save_format='jsonl',
|
|
308
|
+
requests_per_second=2.0, # max 2 requests per second between calls
|
|
240
309
|
multi_requests=[
|
|
241
310
|
{'endpoint': '/api/v1/categories'},
|
|
242
311
|
{'endpoint': '/api/v1/statuses'},
|
|
@@ -245,6 +314,31 @@ MultiHttpToFilesystem(
|
|
|
245
314
|
)
|
|
246
315
|
```
|
|
247
316
|
|
|
317
|
+
**Parallel execution:**
|
|
318
|
+
|
|
319
|
+
Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
MultiHttpToFilesystem(
|
|
323
|
+
task_id='fetch_users_parallel',
|
|
324
|
+
http_conn_id='my_api',
|
|
325
|
+
filesystem_conn_id='my_data_lake',
|
|
326
|
+
filesystem_path='raw/users/{{ ds }}/',
|
|
327
|
+
method='GET',
|
|
328
|
+
save_format='json',
|
|
329
|
+
max_workers=5, # up to 5 concurrent threads
|
|
330
|
+
multi_requests=[
|
|
331
|
+
{'endpoint': '/api/v1/users/1'},
|
|
332
|
+
{'endpoint': '/api/v1/users/2'},
|
|
333
|
+
{'endpoint': '/api/v1/users/3'},
|
|
334
|
+
{'endpoint': '/api/v1/users/4'},
|
|
335
|
+
{'endpoint': '/api/v1/users/5'},
|
|
336
|
+
],
|
|
337
|
+
)
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
> Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
|
|
341
|
+
|
|
248
342
|
Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
|
|
249
343
|
|
|
250
344
|
### SQLToFilesystem
|
|
@@ -313,7 +407,9 @@ FilesystemToFilesystem(
|
|
|
313
407
|
|
|
314
408
|
### FilesystemToDatabase
|
|
315
409
|
|
|
316
|
-
Reads files
|
|
410
|
+
Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
|
|
411
|
+
|
|
412
|
+
**Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
|
|
317
413
|
|
|
318
414
|
```python
|
|
319
415
|
from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
|
|
@@ -325,7 +421,7 @@ FilesystemToDatabaseOperator(
|
|
|
325
421
|
filesystem_path='raw/orders/{{ ds }}/',
|
|
326
422
|
db_schema='public',
|
|
327
423
|
db_table='orders',
|
|
328
|
-
source_format='csv',
|
|
424
|
+
source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
|
|
329
425
|
table_aggregation_type='append', # 'append' | 'replace' | 'fail'
|
|
330
426
|
metadata={
|
|
331
427
|
'_ds': '{{ ds }}',
|
|
@@ -335,6 +431,52 @@ FilesystemToDatabaseOperator(
|
|
|
335
431
|
)
|
|
336
432
|
```
|
|
337
433
|
|
|
434
|
+
**Excel** (requires the `[excel]` extra):
|
|
435
|
+
|
|
436
|
+
```python
|
|
437
|
+
FilesystemToDatabaseOperator(
|
|
438
|
+
task_id='load_excel_report',
|
|
439
|
+
filesystem_conn_id='my_data_lake',
|
|
440
|
+
database_conn_id='my_postgres',
|
|
441
|
+
filesystem_path='raw/reports/{{ ds }}/',
|
|
442
|
+
db_table='monthly_report',
|
|
443
|
+
source_format='excel',
|
|
444
|
+
source_format_options={'sheet_name': 'Data'},
|
|
445
|
+
)
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
**Avro** (requires the `[avro]` extra):
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
FilesystemToDatabaseOperator(
|
|
452
|
+
task_id='load_avro_events',
|
|
453
|
+
filesystem_conn_id='my_data_lake',
|
|
454
|
+
database_conn_id='my_postgres',
|
|
455
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
456
|
+
db_table='events',
|
|
457
|
+
source_format='avro',
|
|
458
|
+
)
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
**Fixed-width** (no extra required — pandas native):
|
|
462
|
+
|
|
463
|
+
```python
|
|
464
|
+
FilesystemToDatabaseOperator(
|
|
465
|
+
task_id='load_fixed_width',
|
|
466
|
+
filesystem_conn_id='my_data_lake',
|
|
467
|
+
database_conn_id='my_postgres',
|
|
468
|
+
filesystem_path='raw/exports/{{ ds }}/',
|
|
469
|
+
db_table='transactions',
|
|
470
|
+
source_format='fixed_width',
|
|
471
|
+
source_format_options={
|
|
472
|
+
'colspecs': [(0, 10), (10, 25), (25, 35)],
|
|
473
|
+
'names': ['date', 'description', 'amount'],
|
|
474
|
+
},
|
|
475
|
+
)
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
|
|
479
|
+
|
|
338
480
|
### DuckdbToDeltalake
|
|
339
481
|
|
|
340
482
|
Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
|
|
@@ -530,6 +672,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
|
|
|
530
672
|
|
|
531
673
|
---
|
|
532
674
|
|
|
675
|
+
## Testing Utilities
|
|
676
|
+
|
|
677
|
+
### MockFilesystem
|
|
678
|
+
|
|
679
|
+
`MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
|
|
680
|
+
|
|
681
|
+
```python
|
|
682
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
683
|
+
|
|
684
|
+
# Pre-load files at construction time
|
|
685
|
+
fs = MockFilesystem({
|
|
686
|
+
"raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
|
|
687
|
+
})
|
|
688
|
+
|
|
689
|
+
# Or write files programmatically
|
|
690
|
+
fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
|
|
691
|
+
|
|
692
|
+
# Inspect the result in assertions
|
|
693
|
+
assert fs.check_file("raw/orders/2024-01-01/data.csv")
|
|
694
|
+
assert len(fs.list_files("raw/orders/")) == 2
|
|
695
|
+
assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
|
|
696
|
+
```
|
|
697
|
+
|
|
698
|
+
Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
|
|
699
|
+
|
|
700
|
+
```python
|
|
701
|
+
from unittest.mock import patch
|
|
702
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
703
|
+
|
|
704
|
+
def test_my_pipeline(tmp_path):
|
|
705
|
+
fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
|
|
706
|
+
|
|
707
|
+
with patch(
|
|
708
|
+
"airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
|
|
709
|
+
return_value=fs,
|
|
710
|
+
):
|
|
711
|
+
# run your operator or task here
|
|
712
|
+
...
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
`MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
|
|
716
|
+
|
|
717
|
+
---
|
|
718
|
+
|
|
533
719
|
## Running Tests
|
|
534
720
|
|
|
535
721
|
### Integration tests
|
|
@@ -83,10 +83,11 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
83
83
|
| `google` | `providers-google` | GCS filesystem backend |
|
|
84
84
|
| `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
|
|
85
85
|
| `sftp` | `providers-sftp` | SFTP filesystem backend |
|
|
86
|
-
| `slack` | `providers-slack` | Slack failure notifications |
|
|
87
86
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
88
87
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
89
88
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
89
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
90
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
90
91
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
91
92
|
|
|
92
93
|
---
|
|
@@ -131,7 +132,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
|
|
|
131
132
|
|
|
132
133
|
### HttpToFilesystem
|
|
133
134
|
|
|
134
|
-
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
|
|
135
|
+
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
|
|
135
136
|
|
|
136
137
|
```python
|
|
137
138
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
|
|
@@ -148,7 +149,7 @@ HttpToFilesystem(
|
|
|
148
149
|
)
|
|
149
150
|
```
|
|
150
151
|
|
|
151
|
-
With cursor-based pagination
|
|
152
|
+
**With cursor-based pagination:**
|
|
152
153
|
|
|
153
154
|
```python
|
|
154
155
|
def next_page(response):
|
|
@@ -170,9 +171,70 @@ HttpToFilesystem(
|
|
|
170
171
|
)
|
|
171
172
|
```
|
|
172
173
|
|
|
174
|
+
**With OAuth 2.0 Client Credentials:**
|
|
175
|
+
|
|
176
|
+
`OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
|
|
180
|
+
|
|
181
|
+
HttpToFilesystem(
|
|
182
|
+
task_id='fetch_protected_data',
|
|
183
|
+
http_conn_id='my_api',
|
|
184
|
+
filesystem_conn_id='my_data_lake',
|
|
185
|
+
filesystem_path='raw/data/{{ ds }}/',
|
|
186
|
+
endpoint='/api/v1/data',
|
|
187
|
+
method='GET',
|
|
188
|
+
save_format='jsonl',
|
|
189
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
190
|
+
token_url='https://auth.example.com/oauth2/token',
|
|
191
|
+
client_id='{{ var.value.oauth2_client_id }}',
|
|
192
|
+
client_secret='{{ var.value.oauth2_client_secret }}',
|
|
193
|
+
scope='read', # optional
|
|
194
|
+
),
|
|
195
|
+
)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
**With rate limiting:**
|
|
199
|
+
|
|
200
|
+
Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
HttpToFilesystem(
|
|
204
|
+
task_id='fetch_with_rate_limit',
|
|
205
|
+
http_conn_id='my_api',
|
|
206
|
+
filesystem_conn_id='my_data_lake',
|
|
207
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
208
|
+
endpoint='/api/v1/events',
|
|
209
|
+
method='GET',
|
|
210
|
+
pagination_function=next_page,
|
|
211
|
+
save_format='jsonl',
|
|
212
|
+
requests_per_second=3.0, # max 3 requests per second between pages
|
|
213
|
+
)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
**Supported response formats:**
|
|
217
|
+
|
|
218
|
+
`save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
|
|
219
|
+
|
|
220
|
+
| `source_format` / `save_format` | File extension | Notes |
|
|
221
|
+
|---|---|---|
|
|
222
|
+
| `json` | `.json` | Single JSON object or array |
|
|
223
|
+
| `jsonl` | `.jsonl` | Array response written as one record per line |
|
|
224
|
+
| `csv` | `.csv` | Raw CSV text from the response |
|
|
225
|
+
| `xml` | `.xml` | Raw XML text from the response |
|
|
226
|
+
| `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
|
|
227
|
+
| `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
|
|
228
|
+
| `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
|
|
229
|
+
| `fixed_width` | `.fwf` | Fixed-width text from the response |
|
|
230
|
+
|
|
231
|
+
All text and JSON formats support gzip/zip compression via the `compression` parameter.
|
|
232
|
+
|
|
173
233
|
### MultiHttpToFilesystem
|
|
174
234
|
|
|
175
|
-
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file.
|
|
235
|
+
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
|
|
236
|
+
|
|
237
|
+
**Sequential with rate limiting:**
|
|
176
238
|
|
|
177
239
|
```python
|
|
178
240
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
|
|
@@ -184,6 +246,7 @@ MultiHttpToFilesystem(
|
|
|
184
246
|
filesystem_path='raw/reference/{{ ds }}/',
|
|
185
247
|
method='GET',
|
|
186
248
|
save_format='jsonl',
|
|
249
|
+
requests_per_second=2.0, # max 2 requests per second between calls
|
|
187
250
|
multi_requests=[
|
|
188
251
|
{'endpoint': '/api/v1/categories'},
|
|
189
252
|
{'endpoint': '/api/v1/statuses'},
|
|
@@ -192,6 +255,31 @@ MultiHttpToFilesystem(
|
|
|
192
255
|
)
|
|
193
256
|
```
|
|
194
257
|
|
|
258
|
+
**Parallel execution:**
|
|
259
|
+
|
|
260
|
+
Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
MultiHttpToFilesystem(
|
|
264
|
+
task_id='fetch_users_parallel',
|
|
265
|
+
http_conn_id='my_api',
|
|
266
|
+
filesystem_conn_id='my_data_lake',
|
|
267
|
+
filesystem_path='raw/users/{{ ds }}/',
|
|
268
|
+
method='GET',
|
|
269
|
+
save_format='json',
|
|
270
|
+
max_workers=5, # up to 5 concurrent threads
|
|
271
|
+
multi_requests=[
|
|
272
|
+
{'endpoint': '/api/v1/users/1'},
|
|
273
|
+
{'endpoint': '/api/v1/users/2'},
|
|
274
|
+
{'endpoint': '/api/v1/users/3'},
|
|
275
|
+
{'endpoint': '/api/v1/users/4'},
|
|
276
|
+
{'endpoint': '/api/v1/users/5'},
|
|
277
|
+
],
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
> Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
|
|
282
|
+
|
|
195
283
|
Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
|
|
196
284
|
|
|
197
285
|
### SQLToFilesystem
|
|
@@ -260,7 +348,9 @@ FilesystemToFilesystem(
|
|
|
260
348
|
|
|
261
349
|
### FilesystemToDatabase
|
|
262
350
|
|
|
263
|
-
Reads files
|
|
351
|
+
Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
|
|
352
|
+
|
|
353
|
+
**Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
|
|
264
354
|
|
|
265
355
|
```python
|
|
266
356
|
from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
|
|
@@ -272,7 +362,7 @@ FilesystemToDatabaseOperator(
|
|
|
272
362
|
filesystem_path='raw/orders/{{ ds }}/',
|
|
273
363
|
db_schema='public',
|
|
274
364
|
db_table='orders',
|
|
275
|
-
source_format='csv',
|
|
365
|
+
source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
|
|
276
366
|
table_aggregation_type='append', # 'append' | 'replace' | 'fail'
|
|
277
367
|
metadata={
|
|
278
368
|
'_ds': '{{ ds }}',
|
|
@@ -282,6 +372,52 @@ FilesystemToDatabaseOperator(
|
|
|
282
372
|
)
|
|
283
373
|
```
|
|
284
374
|
|
|
375
|
+
**Excel** (requires the `[excel]` extra):
|
|
376
|
+
|
|
377
|
+
```python
|
|
378
|
+
FilesystemToDatabaseOperator(
|
|
379
|
+
task_id='load_excel_report',
|
|
380
|
+
filesystem_conn_id='my_data_lake',
|
|
381
|
+
database_conn_id='my_postgres',
|
|
382
|
+
filesystem_path='raw/reports/{{ ds }}/',
|
|
383
|
+
db_table='monthly_report',
|
|
384
|
+
source_format='excel',
|
|
385
|
+
source_format_options={'sheet_name': 'Data'},
|
|
386
|
+
)
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
**Avro** (requires the `[avro]` extra):
|
|
390
|
+
|
|
391
|
+
```python
|
|
392
|
+
FilesystemToDatabaseOperator(
|
|
393
|
+
task_id='load_avro_events',
|
|
394
|
+
filesystem_conn_id='my_data_lake',
|
|
395
|
+
database_conn_id='my_postgres',
|
|
396
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
397
|
+
db_table='events',
|
|
398
|
+
source_format='avro',
|
|
399
|
+
)
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
**Fixed-width** (no extra required — pandas native):
|
|
403
|
+
|
|
404
|
+
```python
|
|
405
|
+
FilesystemToDatabaseOperator(
|
|
406
|
+
task_id='load_fixed_width',
|
|
407
|
+
filesystem_conn_id='my_data_lake',
|
|
408
|
+
database_conn_id='my_postgres',
|
|
409
|
+
filesystem_path='raw/exports/{{ ds }}/',
|
|
410
|
+
db_table='transactions',
|
|
411
|
+
source_format='fixed_width',
|
|
412
|
+
source_format_options={
|
|
413
|
+
'colspecs': [(0, 10), (10, 25), (25, 35)],
|
|
414
|
+
'names': ['date', 'description', 'amount'],
|
|
415
|
+
},
|
|
416
|
+
)
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
|
|
420
|
+
|
|
285
421
|
### DuckdbToDeltalake
|
|
286
422
|
|
|
287
423
|
Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
|
|
@@ -477,6 +613,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
|
|
|
477
613
|
|
|
478
614
|
---
|
|
479
615
|
|
|
616
|
+
## Testing Utilities
|
|
617
|
+
|
|
618
|
+
### MockFilesystem
|
|
619
|
+
|
|
620
|
+
`MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
|
|
621
|
+
|
|
622
|
+
```python
|
|
623
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
624
|
+
|
|
625
|
+
# Pre-load files at construction time
|
|
626
|
+
fs = MockFilesystem({
|
|
627
|
+
"raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
|
|
628
|
+
})
|
|
629
|
+
|
|
630
|
+
# Or write files programmatically
|
|
631
|
+
fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
|
|
632
|
+
|
|
633
|
+
# Inspect the result in assertions
|
|
634
|
+
assert fs.check_file("raw/orders/2024-01-01/data.csv")
|
|
635
|
+
assert len(fs.list_files("raw/orders/")) == 2
|
|
636
|
+
assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
|
|
640
|
+
|
|
641
|
+
```python
|
|
642
|
+
from unittest.mock import patch
|
|
643
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
644
|
+
|
|
645
|
+
def test_my_pipeline(tmp_path):
|
|
646
|
+
fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
|
|
647
|
+
|
|
648
|
+
with patch(
|
|
649
|
+
"airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
|
|
650
|
+
return_value=fs,
|
|
651
|
+
):
|
|
652
|
+
# run your operator or task here
|
|
653
|
+
...
|
|
654
|
+
```
|
|
655
|
+
|
|
656
|
+
`MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
|
|
657
|
+
|
|
658
|
+
---
|
|
659
|
+
|
|
480
660
|
## Running Tests
|
|
481
661
|
|
|
482
662
|
### Integration tests
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "airflow-toolkit"
|
|
3
|
-
version = "2.2.0"
|
|
3
|
+
version = "2.4.0"
|
|
4
4
|
description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
|
|
5
5
|
authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
|
|
6
6
|
requires-python = ">=3.11,<3.15"
|
|
@@ -49,6 +49,12 @@ duckdb = [
|
|
|
49
49
|
sqlite = [
|
|
50
50
|
"apache-airflow-providers-sqlite",
|
|
51
51
|
]
|
|
52
|
+
excel = [
|
|
53
|
+
"openpyxl>=3.1",
|
|
54
|
+
]
|
|
55
|
+
avro = [
|
|
56
|
+
"fastavro>=1.9",
|
|
57
|
+
]
|
|
52
58
|
airflow3-full = [
|
|
53
59
|
"apache-airflow>=3,<4",
|
|
54
60
|
"apache-airflow-providers-fab>=3.0.0",
|
|
@@ -66,6 +72,8 @@ airflow3-full = [
|
|
|
66
72
|
"jmespath>=1.0.1,<2",
|
|
67
73
|
"airflow-provider-duckdb>=0.1.2",
|
|
68
74
|
"apache-airflow-providers-sqlite",
|
|
75
|
+
"openpyxl>=3.1",
|
|
76
|
+
"fastavro>=1.9",
|
|
69
77
|
]
|
|
70
78
|
|
|
71
79
|
[dependency-groups]
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import gzip
|
|
2
2
|
import zipfile
|
|
3
3
|
from io import BytesIO
|
|
4
|
-
from typing import Literal, Union
|
4
|
+
|
|
5
|
+
from airflow_toolkit.types import CompressionOptions
|
|
5
6
|
|
|
6
7
|
DEFAULT_ZIP_FILENAME = "file.zip"
|
|
7
|
-
CompressionOptions = Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], None]
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def gzip_data(data: bytes) -> bytes:
|