airflow-toolkit 2.2.0__tar.gz → 2.4.0__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (55)
  1. {airflow_toolkit-2.2.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.4.0}/PKG-INFO +193 -7
  2. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/README.md +186 -6
  3. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/pyproject.toml +9 -1
  4. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/compression_utils.py +2 -2
  5. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +90 -6
  6. airflow_toolkit-2.4.0/src/airflow_toolkit/providers/filesystem/operators/auth.py +66 -0
  7. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +154 -82
  8. airflow_toolkit-2.4.0/src/airflow_toolkit/testing.py +59 -0
  9. airflow_toolkit-2.4.0/src/airflow_toolkit/types.py +51 -0
  10. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0/src/airflow_toolkit.egg-info}/PKG-INFO +193 -7
  11. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/SOURCES.txt +3 -0
  12. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/requires.txt +8 -0
  13. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/LICENSE.txt +0 -0
  14. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/setup.cfg +0 -0
  15. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/__init__.py +0 -0
  16. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
  17. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/exceptions.py +0 -0
  18. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
  19. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
  20. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
  21. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
  22. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
  23. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
  24. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
  25. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
  26. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
  27. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
  28. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
  29. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
  30. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
  31. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
  32. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
  33. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
  34. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
  35. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/context.py +0 -0
  36. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/protocols.py +0 -0
  37. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/__init__.py +0 -0
  38. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
  39. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
  40. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
  41. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
  42. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
  43. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
  44. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
  45. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
  46. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
  47. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
  48. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
  49. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
  50. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
  51. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/package.py +0 -0
  52. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/py.typed +0 -0
  53. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
  54. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
  55. {airflow_toolkit-2.2.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: airflow-toolkit
3
- Version: 2.2.0
3
+ Version: 2.4.0
4
4
  Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
5
5
  Author-email: Biel Llobera <biel_llobera@dkl.digital>
6
6
  Requires-Python: <3.15,>=3.11
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
32
32
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
33
33
  Provides-Extra: sqlite
34
34
  Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
35
+ Provides-Extra: excel
36
+ Requires-Dist: openpyxl>=3.1; extra == "excel"
37
+ Provides-Extra: avro
38
+ Requires-Dist: fastavro>=1.9; extra == "avro"
35
39
  Provides-Extra: airflow3-full
36
40
  Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
37
41
  Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
49
53
  Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
50
54
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
51
55
  Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
56
+ Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
57
+ Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
52
58
  Dynamic: license-file
53
59
 
54
60
  # Airflow Toolkit
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
136
142
  | `google` | `providers-google` | GCS filesystem backend |
137
143
  | `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
138
144
  | `sftp` | `providers-sftp` | SFTP filesystem backend |
139
- | `slack` | `providers-slack` | Slack failure notifications |
140
145
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
141
146
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
142
147
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
148
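+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem`; note that `openpyxl` itself reads only `.xlsx`-family files, so legacy `.xls` input may require an additional engine such as `xlrd` |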
149
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
143
150
  | `airflow3-full` | all of the above | Quick start / development |
144
151
 
145
152
  ---
@@ -184,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
184
191
 
185
192
  ### HttpToFilesystem
186
193
 
187
- Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
194
+ Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
188
195
 
189
196
  ```python
190
197
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
@@ -201,7 +208,7 @@ HttpToFilesystem(
201
208
  )
202
209
  ```
203
210
 
204
- With cursor-based pagination:
211
+ **With cursor-based pagination:**
205
212
 
206
213
  ```python
207
214
  def next_page(response):
@@ -223,9 +230,70 @@ HttpToFilesystem(
223
230
  )
224
231
  ```
225
232
 
233
+ **With OAuth 2.0 Client Credentials:**
234
+
235
+ `OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
236
+
237
+ ```python
238
+ from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
239
+
240
+ HttpToFilesystem(
241
+ task_id='fetch_protected_data',
242
+ http_conn_id='my_api',
243
+ filesystem_conn_id='my_data_lake',
244
+ filesystem_path='raw/data/{{ ds }}/',
245
+ endpoint='/api/v1/data',
246
+ method='GET',
247
+ save_format='jsonl',
248
+ auth_type=OAuth2ClientCredentials.client_credentials(
249
+ token_url='https://auth.example.com/oauth2/token',
250
+ client_id='{{ var.value.oauth2_client_id }}',
251
+ client_secret='{{ var.value.oauth2_client_secret }}',
252
+ scope='read', # optional
253
+ ),
254
+ )
255
+ ```
256
+
257
+ **With rate limiting:**
258
+
259
+ Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
260
+
261
+ ```python
262
+ HttpToFilesystem(
263
+ task_id='fetch_with_rate_limit',
264
+ http_conn_id='my_api',
265
+ filesystem_conn_id='my_data_lake',
266
+ filesystem_path='raw/events/{{ ds }}/',
267
+ endpoint='/api/v1/events',
268
+ method='GET',
269
+ pagination_function=next_page,
270
+ save_format='jsonl',
271
+ requests_per_second=3.0, # throttle pagination to at most 3 requests per second
272
+ )
273
+ ```
274
+
275
+ **Supported response formats:**
276
+
277
+ `save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
278
+
279
+ | `source_format` / `save_format` | File extension | Notes |
280
+ |---|---|---|
281
+ | `json` | `.json` | Single JSON object or array |
282
+ | `jsonl` | `.jsonl` | Array response written as one record per line |
283
+ | `csv` | `.csv` | Raw CSV text from the response |
284
+ | `xml` | `.xml` | Raw XML text from the response |
285
+ | `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
286
+ | `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
287
+ | `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
288
+ | `fixed_width` | `.fwf` | Fixed-width text from the response |
289
+
290
+ All text and JSON formats support gzip/zip compression via the `compression` parameter.
291
+
226
292
  ### MultiHttpToFilesystem
227
293
 
228
- Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Useful for fetching multiple entities or date ranges without creating one task per request.
294
+ Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
295
+
296
+ **Sequential with rate limiting:**
229
297
 
230
298
  ```python
231
299
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
@@ -237,6 +305,7 @@ MultiHttpToFilesystem(
237
305
  filesystem_path='raw/reference/{{ ds }}/',
238
306
  method='GET',
239
307
  save_format='jsonl',
308
+ requests_per_second=2.0, # max 2 requests per second between calls
240
309
  multi_requests=[
241
310
  {'endpoint': '/api/v1/categories'},
242
311
  {'endpoint': '/api/v1/statuses'},
@@ -245,6 +314,31 @@ MultiHttpToFilesystem(
245
314
  )
246
315
  ```
247
316
 
317
+ **Parallel execution:**
318
+
319
+ Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
320
+
321
+ ```python
322
+ MultiHttpToFilesystem(
323
+ task_id='fetch_users_parallel',
324
+ http_conn_id='my_api',
325
+ filesystem_conn_id='my_data_lake',
326
+ filesystem_path='raw/users/{{ ds }}/',
327
+ method='GET',
328
+ save_format='json',
329
+ max_workers=5, # up to 5 concurrent threads
330
+ multi_requests=[
331
+ {'endpoint': '/api/v1/users/1'},
332
+ {'endpoint': '/api/v1/users/2'},
333
+ {'endpoint': '/api/v1/users/3'},
334
+ {'endpoint': '/api/v1/users/4'},
335
+ {'endpoint': '/api/v1/users/5'},
336
+ ],
337
+ )
338
+ ```
339
+
340
+ > Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
341
+
248
342
  Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
249
343
 
250
344
  ### SQLToFilesystem
@@ -313,7 +407,9 @@ FilesystemToFilesystem(
313
407
 
314
408
  ### FilesystemToDatabase
315
409
 
316
- Reads files (CSV, JSON, or Parquet) from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
410
+ Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
411
+
412
+ **Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
317
413
 
318
414
  ```python
319
415
  from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
@@ -325,7 +421,7 @@ FilesystemToDatabaseOperator(
325
421
  filesystem_path='raw/orders/{{ ds }}/',
326
422
  db_schema='public',
327
423
  db_table='orders',
328
- source_format='csv',
424
+ source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
329
425
  table_aggregation_type='append', # 'append' | 'replace' | 'fail'
330
426
  metadata={
331
427
  '_ds': '{{ ds }}',
@@ -335,6 +431,52 @@ FilesystemToDatabaseOperator(
335
431
  )
336
432
  ```
337
433
 
434
+ **Excel** (requires the `[excel]` extra):
435
+
436
+ ```python
437
+ FilesystemToDatabaseOperator(
438
+ task_id='load_excel_report',
439
+ filesystem_conn_id='my_data_lake',
440
+ database_conn_id='my_postgres',
441
+ filesystem_path='raw/reports/{{ ds }}/',
442
+ db_table='monthly_report',
443
+ source_format='excel',
444
+ source_format_options={'sheet_name': 'Data'},
445
+ )
446
+ ```
447
+
448
+ **Avro** (requires the `[avro]` extra):
449
+
450
+ ```python
451
+ FilesystemToDatabaseOperator(
452
+ task_id='load_avro_events',
453
+ filesystem_conn_id='my_data_lake',
454
+ database_conn_id='my_postgres',
455
+ filesystem_path='raw/events/{{ ds }}/',
456
+ db_table='events',
457
+ source_format='avro',
458
+ )
459
+ ```
460
+
461
+ **Fixed-width** (no extra required — pandas native):
462
+
463
+ ```python
464
+ FilesystemToDatabaseOperator(
465
+ task_id='load_fixed_width',
466
+ filesystem_conn_id='my_data_lake',
467
+ database_conn_id='my_postgres',
468
+ filesystem_path='raw/exports/{{ ds }}/',
469
+ db_table='transactions',
470
+ source_format='fixed_width',
471
+ source_format_options={
472
+ 'colspecs': [(0, 10), (10, 25), (25, 35)],
473
+ 'names': ['date', 'description', 'amount'],
474
+ },
475
+ )
476
+ ```
477
+
478
+ Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
479
+
338
480
  ### DuckdbToDeltalake
339
481
 
340
482
  Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
@@ -530,6 +672,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
530
672
 
531
673
  ---
532
674
 
675
+ ## Testing Utilities
676
+
677
+ ### MockFilesystem
678
+
679
+ `MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
680
+
681
+ ```python
682
+ from airflow_toolkit.testing import MockFilesystem
683
+
684
+ # Pre-load files at construction time
685
+ fs = MockFilesystem({
686
+ "raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
687
+ })
688
+
689
+ # Or write files programmatically
690
+ fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
691
+
692
+ # Inspect the result in assertions
693
+ assert fs.check_file("raw/orders/2024-01-01/data.csv")
694
+ assert len(fs.list_files("raw/orders/")) == 2
695
+ assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
696
+ ```
697
+
698
+ Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
699
+
700
+ ```python
701
+ from unittest.mock import patch
702
+ from airflow_toolkit.testing import MockFilesystem
703
+
704
+ def test_my_pipeline(tmp_path):
705
+ fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
706
+
707
+ with patch(
708
+ "airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
709
+ return_value=fs,
710
+ ):
711
+ # run your operator or task here
712
+ ...
713
+ ```
714
+
715
+ `MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
716
+
717
+ ---
718
+
533
719
  ## Running Tests
534
720
 
535
721
  ### Integration tests
@@ -83,10 +83,11 @@ pip install "airflow-toolkit[airflow3-full]"
83
83
  | `google` | `providers-google` | GCS filesystem backend |
84
84
  | `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
85
85
  | `sftp` | `providers-sftp` | SFTP filesystem backend |
86
- | `slack` | `providers-slack` | Slack failure notifications |
87
86
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
88
87
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
89
88
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
89
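+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem`; note that `openpyxl` itself reads only `.xlsx`-family files, so legacy `.xls` input may require an additional engine such as `xlrd` |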
90
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
90
91
  | `airflow3-full` | all of the above | Quick start / development |
91
92
 
92
93
  ---
@@ -131,7 +132,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
131
132
 
132
133
  ### HttpToFilesystem
133
134
 
134
- Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
135
+ Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
135
136
 
136
137
  ```python
137
138
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
@@ -148,7 +149,7 @@ HttpToFilesystem(
148
149
  )
149
150
  ```
150
151
 
151
- With cursor-based pagination:
152
+ **With cursor-based pagination:**
152
153
 
153
154
  ```python
154
155
  def next_page(response):
@@ -170,9 +171,70 @@ HttpToFilesystem(
170
171
  )
171
172
  ```
172
173
 
174
+ **With OAuth 2.0 Client Credentials:**
175
+
176
+ `OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
177
+
178
+ ```python
179
+ from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
180
+
181
+ HttpToFilesystem(
182
+ task_id='fetch_protected_data',
183
+ http_conn_id='my_api',
184
+ filesystem_conn_id='my_data_lake',
185
+ filesystem_path='raw/data/{{ ds }}/',
186
+ endpoint='/api/v1/data',
187
+ method='GET',
188
+ save_format='jsonl',
189
+ auth_type=OAuth2ClientCredentials.client_credentials(
190
+ token_url='https://auth.example.com/oauth2/token',
191
+ client_id='{{ var.value.oauth2_client_id }}',
192
+ client_secret='{{ var.value.oauth2_client_secret }}',
193
+ scope='read', # optional
194
+ ),
195
+ )
196
+ ```
197
+
198
+ **With rate limiting:**
199
+
200
+ Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
201
+
202
+ ```python
203
+ HttpToFilesystem(
204
+ task_id='fetch_with_rate_limit',
205
+ http_conn_id='my_api',
206
+ filesystem_conn_id='my_data_lake',
207
+ filesystem_path='raw/events/{{ ds }}/',
208
+ endpoint='/api/v1/events',
209
+ method='GET',
210
+ pagination_function=next_page,
211
+ save_format='jsonl',
212
+ requests_per_second=3.0, # throttle pagination to at most 3 requests per second
213
+ )
214
+ ```
215
+
216
+ **Supported response formats:**
217
+
218
+ `save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
219
+
220
+ | `source_format` / `save_format` | File extension | Notes |
221
+ |---|---|---|
222
+ | `json` | `.json` | Single JSON object or array |
223
+ | `jsonl` | `.jsonl` | Array response written as one record per line |
224
+ | `csv` | `.csv` | Raw CSV text from the response |
225
+ | `xml` | `.xml` | Raw XML text from the response |
226
+ | `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
227
+ | `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
228
+ | `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
229
+ | `fixed_width` | `.fwf` | Fixed-width text from the response |
230
+
231
+ All text and JSON formats support gzip/zip compression via the `compression` parameter.
232
+
173
233
  ### MultiHttpToFilesystem
174
234
 
175
- Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Useful for fetching multiple entities or date ranges without creating one task per request.
235
+ Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
236
+
237
+ **Sequential with rate limiting:**
176
238
 
177
239
  ```python
178
240
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
@@ -184,6 +246,7 @@ MultiHttpToFilesystem(
184
246
  filesystem_path='raw/reference/{{ ds }}/',
185
247
  method='GET',
186
248
  save_format='jsonl',
249
+ requests_per_second=2.0, # max 2 requests per second between calls
187
250
  multi_requests=[
188
251
  {'endpoint': '/api/v1/categories'},
189
252
  {'endpoint': '/api/v1/statuses'},
@@ -192,6 +255,31 @@ MultiHttpToFilesystem(
192
255
  )
193
256
  ```
194
257
 
258
+ **Parallel execution:**
259
+
260
+ Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
261
+
262
+ ```python
263
+ MultiHttpToFilesystem(
264
+ task_id='fetch_users_parallel',
265
+ http_conn_id='my_api',
266
+ filesystem_conn_id='my_data_lake',
267
+ filesystem_path='raw/users/{{ ds }}/',
268
+ method='GET',
269
+ save_format='json',
270
+ max_workers=5, # up to 5 concurrent threads
271
+ multi_requests=[
272
+ {'endpoint': '/api/v1/users/1'},
273
+ {'endpoint': '/api/v1/users/2'},
274
+ {'endpoint': '/api/v1/users/3'},
275
+ {'endpoint': '/api/v1/users/4'},
276
+ {'endpoint': '/api/v1/users/5'},
277
+ ],
278
+ )
279
+ ```
280
+
281
+ > Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
282
+
195
283
  Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
196
284
 
197
285
  ### SQLToFilesystem
@@ -260,7 +348,9 @@ FilesystemToFilesystem(
260
348
 
261
349
  ### FilesystemToDatabase
262
350
 
263
- Reads files (CSV, JSON, or Parquet) from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
351
+ Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
352
+
353
+ **Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
264
354
 
265
355
  ```python
266
356
  from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
@@ -272,7 +362,7 @@ FilesystemToDatabaseOperator(
272
362
  filesystem_path='raw/orders/{{ ds }}/',
273
363
  db_schema='public',
274
364
  db_table='orders',
275
- source_format='csv',
365
+ source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
276
366
  table_aggregation_type='append', # 'append' | 'replace' | 'fail'
277
367
  metadata={
278
368
  '_ds': '{{ ds }}',
@@ -282,6 +372,52 @@ FilesystemToDatabaseOperator(
282
372
  )
283
373
  ```
284
374
 
375
+ **Excel** (requires the `[excel]` extra):
376
+
377
+ ```python
378
+ FilesystemToDatabaseOperator(
379
+ task_id='load_excel_report',
380
+ filesystem_conn_id='my_data_lake',
381
+ database_conn_id='my_postgres',
382
+ filesystem_path='raw/reports/{{ ds }}/',
383
+ db_table='monthly_report',
384
+ source_format='excel',
385
+ source_format_options={'sheet_name': 'Data'},
386
+ )
387
+ ```
388
+
389
+ **Avro** (requires the `[avro]` extra):
390
+
391
+ ```python
392
+ FilesystemToDatabaseOperator(
393
+ task_id='load_avro_events',
394
+ filesystem_conn_id='my_data_lake',
395
+ database_conn_id='my_postgres',
396
+ filesystem_path='raw/events/{{ ds }}/',
397
+ db_table='events',
398
+ source_format='avro',
399
+ )
400
+ ```
401
+
402
+ **Fixed-width** (no extra required — pandas native):
403
+
404
+ ```python
405
+ FilesystemToDatabaseOperator(
406
+ task_id='load_fixed_width',
407
+ filesystem_conn_id='my_data_lake',
408
+ database_conn_id='my_postgres',
409
+ filesystem_path='raw/exports/{{ ds }}/',
410
+ db_table='transactions',
411
+ source_format='fixed_width',
412
+ source_format_options={
413
+ 'colspecs': [(0, 10), (10, 25), (25, 35)],
414
+ 'names': ['date', 'description', 'amount'],
415
+ },
416
+ )
417
+ ```
418
+
419
+ Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
420
+
285
421
  ### DuckdbToDeltalake
286
422
 
287
423
  Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
@@ -477,6 +613,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
477
613
 
478
614
  ---
479
615
 
616
+ ## Testing Utilities
617
+
618
+ ### MockFilesystem
619
+
620
+ `MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
621
+
622
+ ```python
623
+ from airflow_toolkit.testing import MockFilesystem
624
+
625
+ # Pre-load files at construction time
626
+ fs = MockFilesystem({
627
+ "raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
628
+ })
629
+
630
+ # Or write files programmatically
631
+ fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
632
+
633
+ # Inspect the result in assertions
634
+ assert fs.check_file("raw/orders/2024-01-01/data.csv")
635
+ assert len(fs.list_files("raw/orders/")) == 2
636
+ assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
637
+ ```
638
+
639
+ Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
640
+
641
+ ```python
642
+ from unittest.mock import patch
643
+ from airflow_toolkit.testing import MockFilesystem
644
+
645
+ def test_my_pipeline(tmp_path):
646
+ fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
647
+
648
+ with patch(
649
+ "airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
650
+ return_value=fs,
651
+ ):
652
+ # run your operator or task here
653
+ ...
654
+ ```
655
+
656
+ `MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
657
+
658
+ ---
659
+
480
660
  ## Running Tests
481
661
 
482
662
  ### Integration tests
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "airflow-toolkit"
3
- version = "2.2.0"
3
+ version = "2.4.0"
4
4
  description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
5
5
  authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
6
6
  requires-python = ">=3.11,<3.15"
@@ -49,6 +49,12 @@ duckdb = [
49
49
  sqlite = [
50
50
  "apache-airflow-providers-sqlite",
51
51
  ]
52
+ excel = [
53
+ "openpyxl>=3.1",
54
+ ]
55
+ avro = [
56
+ "fastavro>=1.9",
57
+ ]
52
58
  airflow3-full = [
53
59
  "apache-airflow>=3,<4",
54
60
  "apache-airflow-providers-fab>=3.0.0",
@@ -66,6 +72,8 @@ airflow3-full = [
66
72
  "jmespath>=1.0.1,<2",
67
73
  "airflow-provider-duckdb>=0.1.2",
68
74
  "apache-airflow-providers-sqlite",
75
+ "openpyxl>=3.1",
76
+ "fastavro>=1.9",
69
77
  ]
70
78
 
71
79
  [dependency-groups]
@@ -1,10 +1,10 @@
1
1
  import gzip
2
2
  import zipfile
3
3
  from io import BytesIO
4
- from typing import Literal, Union
4
+
5
+ from airflow_toolkit.types import CompressionOptions
5
6
 
6
7
  DEFAULT_ZIP_FILENAME = "file.zip"
7
- CompressionOptions = Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], None]
8
8
 
9
9
 
10
10
  def gzip_data(data: bytes) -> bytes: