airflow-toolkit 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {airflow_toolkit-2.3.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.4.0}/PKG-INFO +93 -6
  2. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/README.md +92 -5
  3. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/pyproject.toml +1 -1
  4. airflow_toolkit-2.4.0/src/airflow_toolkit/providers/filesystem/operators/auth.py +66 -0
  5. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +147 -47
  6. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0/src/airflow_toolkit.egg-info}/PKG-INFO +93 -6
  7. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/SOURCES.txt +1 -0
  8. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/LICENSE.txt +0 -0
  9. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/setup.cfg +0 -0
  10. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/__init__.py +0 -0
  11. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
  12. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/compression_utils.py +0 -0
  13. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/exceptions.py +0 -0
  14. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
  15. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
  16. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
  17. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
  18. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
  19. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
  20. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
  21. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
  22. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
  23. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
  24. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
  25. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
  26. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
  27. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
  28. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
  29. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
  30. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
  31. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/context.py +0 -0
  32. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/protocols.py +0 -0
  33. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/__init__.py +0 -0
  34. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
  35. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
  36. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
  37. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
  38. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
  39. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
  40. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
  41. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +0 -0
  42. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
  43. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
  44. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
  45. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
  46. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
  47. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
  48. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/package.py +0 -0
  49. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/py.typed +0 -0
  50. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/testing.py +0 -0
  51. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/types.py +0 -0
  52. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
  53. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
  54. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/requires.txt +0 -0
  55. {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: airflow-toolkit
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
5
5
  Author-email: Biel Llobera <biel_llobera@dkl.digital>
6
6
  Requires-Python: <3.15,>=3.11
@@ -145,8 +145,8 @@ pip install "airflow-toolkit[airflow3-full]"
145
145
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
146
146
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
147
147
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
148
- | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
149
- | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
148
+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
149
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
150
150
  | `airflow3-full` | all of the above | Quick start / development |
151
151
 
152
152
  ---
@@ -191,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
191
191
 
192
192
  ### HttpToFilesystem
193
193
 
194
- Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
194
+ Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
195
195
 
196
196
  ```python
197
197
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
@@ -208,7 +208,7 @@ HttpToFilesystem(
208
208
  )
209
209
  ```
210
210
 
211
- With cursor-based pagination:
211
+ **With cursor-based pagination:**
212
212
 
213
213
  ```python
214
214
  def next_page(response):
@@ -230,9 +230,70 @@ HttpToFilesystem(
230
230
  )
231
231
  ```
232
232
 
233
+ **With OAuth 2.0 Client Credentials:**
234
+
235
+ `OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
236
+
237
+ ```python
238
+ from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
239
+
240
+ HttpToFilesystem(
241
+ task_id='fetch_protected_data',
242
+ http_conn_id='my_api',
243
+ filesystem_conn_id='my_data_lake',
244
+ filesystem_path='raw/data/{{ ds }}/',
245
+ endpoint='/api/v1/data',
246
+ method='GET',
247
+ save_format='jsonl',
248
+ auth_type=OAuth2ClientCredentials.client_credentials(
249
+ token_url='https://auth.example.com/oauth2/token',
250
+ client_id='{{ var.value.oauth2_client_id }}',
251
+ client_secret='{{ var.value.oauth2_client_secret }}',
252
+ scope='read', # optional
253
+ ),
254
+ )
255
+ ```
256
+
257
+ **With rate limiting:**
258
+
259
+ Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
260
+
261
+ ```python
262
+ HttpToFilesystem(
263
+ task_id='fetch_with_rate_limit',
264
+ http_conn_id='my_api',
265
+ filesystem_conn_id='my_data_lake',
266
+ filesystem_path='raw/events/{{ ds }}/',
267
+ endpoint='/api/v1/events',
268
+ method='GET',
269
+ pagination_function=next_page,
270
+ save_format='jsonl',
271
+ requests_per_second=3.0, # max 3 requests per second between pages
272
+ )
273
+ ```
274
+
275
+ **Supported response formats:**
276
+
277
+ `save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
278
+
279
+ | `source_format` / `save_format` | File extension | Notes |
280
+ |---|---|---|
281
+ | `json` | `.json` | Single JSON object or array |
282
+ | `jsonl` | `.jsonl` | Array response written as one record per line |
283
+ | `csv` | `.csv` | Raw CSV text from the response |
284
+ | `xml` | `.xml` | Raw XML text from the response |
285
+ | `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
286
+ | `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
287
+ | `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
288
+ | `fixed_width` | `.fwf` | Fixed-width text from the response |
289
+
290
+ All text and JSON formats support gzip/zip compression via the `compression` parameter.
291
+
233
292
  ### MultiHttpToFilesystem
234
293
 
235
- Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Useful for fetching multiple entities or date ranges without creating one task per request.
294
+ Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
295
+
296
+ **Sequential with rate limiting:**
236
297
 
237
298
  ```python
238
299
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
@@ -244,6 +305,7 @@ MultiHttpToFilesystem(
244
305
  filesystem_path='raw/reference/{{ ds }}/',
245
306
  method='GET',
246
307
  save_format='jsonl',
308
+ requests_per_second=2.0, # max 2 requests per second between calls
247
309
  multi_requests=[
248
310
  {'endpoint': '/api/v1/categories'},
249
311
  {'endpoint': '/api/v1/statuses'},
@@ -252,6 +314,31 @@ MultiHttpToFilesystem(
252
314
  )
253
315
  ```
254
316
 
317
+ **Parallel execution:**
318
+
319
+ Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
320
+
321
+ ```python
322
+ MultiHttpToFilesystem(
323
+ task_id='fetch_users_parallel',
324
+ http_conn_id='my_api',
325
+ filesystem_conn_id='my_data_lake',
326
+ filesystem_path='raw/users/{{ ds }}/',
327
+ method='GET',
328
+ save_format='json',
329
+ max_workers=5, # up to 5 concurrent threads
330
+ multi_requests=[
331
+ {'endpoint': '/api/v1/users/1'},
332
+ {'endpoint': '/api/v1/users/2'},
333
+ {'endpoint': '/api/v1/users/3'},
334
+ {'endpoint': '/api/v1/users/4'},
335
+ {'endpoint': '/api/v1/users/5'},
336
+ ],
337
+ )
338
+ ```
339
+
340
+ > Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode it is not applied and the thread pool alone bounds concurrency — keep `max_workers` low enough to avoid overwhelming the API.
341
+
255
342
  Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
256
343
 
257
344
  ### SQLToFilesystem
@@ -86,8 +86,8 @@ pip install "airflow-toolkit[airflow3-full]"
86
86
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
87
87
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
88
88
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
89
- | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
90
- | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
89
+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
90
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
91
91
  | `airflow3-full` | all of the above | Quick start / development |
92
92
 
93
93
  ---
@@ -132,7 +132,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
132
132
 
133
133
  ### HttpToFilesystem
134
134
 
135
- Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
135
+ Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
136
136
 
137
137
  ```python
138
138
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
@@ -149,7 +149,7 @@ HttpToFilesystem(
149
149
  )
150
150
  ```
151
151
 
152
- With cursor-based pagination:
152
+ **With cursor-based pagination:**
153
153
 
154
154
  ```python
155
155
  def next_page(response):
@@ -171,9 +171,70 @@ HttpToFilesystem(
171
171
  )
172
172
  ```
173
173
 
174
+ **With OAuth 2.0 Client Credentials:**
175
+
176
+ `OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
177
+
178
+ ```python
179
+ from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
180
+
181
+ HttpToFilesystem(
182
+ task_id='fetch_protected_data',
183
+ http_conn_id='my_api',
184
+ filesystem_conn_id='my_data_lake',
185
+ filesystem_path='raw/data/{{ ds }}/',
186
+ endpoint='/api/v1/data',
187
+ method='GET',
188
+ save_format='jsonl',
189
+ auth_type=OAuth2ClientCredentials.client_credentials(
190
+ token_url='https://auth.example.com/oauth2/token',
191
+ client_id='{{ var.value.oauth2_client_id }}',
192
+ client_secret='{{ var.value.oauth2_client_secret }}',
193
+ scope='read', # optional
194
+ ),
195
+ )
196
+ ```
197
+
198
+ **With rate limiting:**
199
+
200
+ Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
201
+
202
+ ```python
203
+ HttpToFilesystem(
204
+ task_id='fetch_with_rate_limit',
205
+ http_conn_id='my_api',
206
+ filesystem_conn_id='my_data_lake',
207
+ filesystem_path='raw/events/{{ ds }}/',
208
+ endpoint='/api/v1/events',
209
+ method='GET',
210
+ pagination_function=next_page,
211
+ save_format='jsonl',
212
+ requests_per_second=3.0, # max 3 requests per second between pages
213
+ )
214
+ ```
215
+
216
+ **Supported response formats:**
217
+
218
+ `save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
219
+
220
+ | `source_format` / `save_format` | File extension | Notes |
221
+ |---|---|---|
222
+ | `json` | `.json` | Single JSON object or array |
223
+ | `jsonl` | `.jsonl` | Array response written as one record per line |
224
+ | `csv` | `.csv` | Raw CSV text from the response |
225
+ | `xml` | `.xml` | Raw XML text from the response |
226
+ | `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
227
+ | `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
228
+ | `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
229
+ | `fixed_width` | `.fwf` | Fixed-width text from the response |
230
+
231
+ All text and JSON formats support gzip/zip compression via the `compression` parameter.
232
+
174
233
  ### MultiHttpToFilesystem
175
234
 
176
- Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Useful for fetching multiple entities or date ranges without creating one task per request.
235
+ Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
236
+
237
+ **Sequential with rate limiting:**
177
238
 
178
239
  ```python
179
240
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
@@ -185,6 +246,7 @@ MultiHttpToFilesystem(
185
246
  filesystem_path='raw/reference/{{ ds }}/',
186
247
  method='GET',
187
248
  save_format='jsonl',
249
+ requests_per_second=2.0, # max 2 requests per second between calls
188
250
  multi_requests=[
189
251
  {'endpoint': '/api/v1/categories'},
190
252
  {'endpoint': '/api/v1/statuses'},
@@ -193,6 +255,31 @@ MultiHttpToFilesystem(
193
255
  )
194
256
  ```
195
257
 
258
+ **Parallel execution:**
259
+
260
+ Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
261
+
262
+ ```python
263
+ MultiHttpToFilesystem(
264
+ task_id='fetch_users_parallel',
265
+ http_conn_id='my_api',
266
+ filesystem_conn_id='my_data_lake',
267
+ filesystem_path='raw/users/{{ ds }}/',
268
+ method='GET',
269
+ save_format='json',
270
+ max_workers=5, # up to 5 concurrent threads
271
+ multi_requests=[
272
+ {'endpoint': '/api/v1/users/1'},
273
+ {'endpoint': '/api/v1/users/2'},
274
+ {'endpoint': '/api/v1/users/3'},
275
+ {'endpoint': '/api/v1/users/4'},
276
+ {'endpoint': '/api/v1/users/5'},
277
+ ],
278
+ )
279
+ ```
280
+
281
+ > Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode it is not applied and the thread pool alone bounds concurrency — keep `max_workers` low enough to avoid overwhelming the API.
282
+
196
283
  Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
197
284
 
198
285
  ### SQLToFilesystem
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "airflow-toolkit"
3
- version = "2.3.0"
3
+ version = "2.4.0"
4
4
  description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
5
5
  authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
6
6
  requires-python = ">=3.11,<3.15"
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import TYPE_CHECKING
5
+
6
+ import requests
7
+ from requests.auth import AuthBase
8
+
9
+ if TYPE_CHECKING:
10
+ from requests import PreparedRequest
11
+
12
+
13
class OAuth2ClientCredentials:
    """Factory for OAuth 2.0 Client Credentials auth.

    Returns a configured AuthBase class that fetches and caches tokens,
    refreshing automatically 30 seconds before expiry.

    Usage:
        auth_type=OAuth2ClientCredentials.client_credentials(
            token_url="https://auth.example.com/token",
            client_id="{{ var.value.client_id }}",
            client_secret="{{ var.value.client_secret }}",
        )

    NOTE(review): the credential strings are captured by closure exactly as
    passed in; confirm that Jinja templates given here are actually rendered
    before the token request is made.
    """

    @staticmethod
    def client_credentials(
        token_url: str,
        client_id: str,
        client_secret: str,
        scope: str | None = None,
        timeout: float = 30.0,
    ) -> type[AuthBase]:
        """Return a configured AuthBase subclass for OAuth 2.0 Client Credentials.

        Each call produces an independent class with its own token cache, so
        multiple operators with different credentials do not share tokens.

        Args:
            token_url: URL of the token endpoint that receives the POST.
            client_id: Value sent as the ``client_id`` form field.
            client_secret: Value sent as the ``client_secret`` form field.
            scope: Optional ``scope`` form field; omitted when falsy.
            timeout: Seconds to wait for the token endpoint before giving up.
                Without it a stalled endpoint would block the task forever.

        Returns:
            An ``AuthBase`` subclass. Instances add an
            ``Authorization: Bearer <token>`` header to each request.

        Raises:
            requests.HTTPError: Raised lazily, on the first request that
                needs a token, if the token endpoint answers with an error.
            KeyError: Likewise lazily, if the token response has no
                ``access_token`` field.
        """
        # Local import keeps this block self-contained (no new module-level import).
        import threading

        class _OAuth2Auth(AuthBase):
            # The cache lives on the class, so every instance of this
            # particular configured class shares one token.
            _token: str | None = None
            _expiry: float = 0.0
            # Serialises check-and-refresh so concurrent callers (e.g. a
            # thread pool) trigger a single token request instead of a burst.
            _lock = threading.Lock()

            def __call__(self, r: "PreparedRequest") -> "PreparedRequest":
                cls = type(self)
                with cls._lock:
                    # Refresh 30 seconds early so a token does not expire in flight.
                    if cls._token is None or time.time() >= cls._expiry - 30:
                        cls._refresh()
                    token = cls._token
                r.headers["Authorization"] = f"Bearer {token}"
                return r

            @classmethod
            def _refresh(cls) -> None:
                """Fetch a new token and update the class-level cache."""
                payload: dict[str, str] = {
                    "grant_type": "client_credentials",
                    "client_id": client_id,
                    "client_secret": client_secret,
                }
                if scope:
                    payload["scope"] = scope
                resp = requests.post(token_url, data=payload, timeout=timeout)
                resp.raise_for_status()
                data = resp.json()
                # Treat a missing or null ``expires_in`` as one hour.
                expires_in = data.get("expires_in")
                lifetime = 3600.0 if expires_in is None else float(expires_in)
                cls._token = data["access_token"]
                cls._expiry = time.time() + lifetime

        return _OAuth2Auth
@@ -2,7 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import logging
5
+ import time
5
6
  import uuid
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
8
  from io import BytesIO, StringIO
7
9
  from typing import (
8
10
  TYPE_CHECKING,
@@ -38,7 +40,10 @@ if TYPE_CHECKING:
38
40
 
39
41
  class HttpBatchOperator(HttpOperator):
40
42
  def execute(
41
- self, context: Context, use_new_data_parameters_on_pagination=False
43
+ self,
44
+ context: Context,
45
+ use_new_data_parameters_on_pagination: bool = False,
46
+ delay: float = 0.0,
42
47
  ) -> Generator[Any, None, None]:
43
48
  self.log.info("Calling HTTP method")
44
49
 
@@ -49,16 +54,22 @@ class HttpBatchOperator(HttpOperator):
49
54
  for response in self.paginate_sync(
50
55
  response=response,
51
56
  use_new_data_parameters_on_pagination=use_new_data_parameters_on_pagination,
57
+ delay=delay,
52
58
  ):
53
59
  yield self.process_response(context=context, response=response)
54
60
 
55
61
  def paginate_sync(
56
- self, response: Response, use_new_data_parameters_on_pagination=False
62
+ self,
63
+ response: Response,
64
+ use_new_data_parameters_on_pagination: bool = False,
65
+ delay: float = 0.0,
57
66
  ) -> Generator[Response, None, None]:
58
67
  if not self.pagination_function:
59
68
  return
60
69
 
61
70
  while True:
71
+ if delay > 0:
72
+ time.sleep(delay)
62
73
  next_page_params = self.pagination_function(response)
63
74
  if not next_page_params:
64
75
  break
@@ -71,7 +82,9 @@ class HttpBatchOperator(HttpOperator):
71
82
  return
72
83
 
73
84
  def _merge_next_page_parameters(
74
- self, next_page_params: dict, use_new_data_parameters_on_pagination=False
85
+ self,
86
+ next_page_params: dict,
87
+ use_new_data_parameters_on_pagination: bool = False,
75
88
  ) -> dict:
76
89
  """Merge initial request parameters with next page parameters.
77
90
 
@@ -119,7 +132,21 @@ class HttpToFilesystem(BaseOperator):
119
132
  template_fields_renderers = HttpOperator.template_fields_renderers
120
133
 
121
134
  json_response_source_format = ["json", "jsonl"]
122
- binary_response_source_format = ["parquet"]
135
+ binary_response_source_format = ["parquet", "excel", "avro"]
136
+
137
+ # Maps source_format/save_format → file extension.
138
+ # Only excel → xlsx and fixed_width → fwf differ from their format name; the rest
139
+ # are listed for completeness. Unlisted formats fall back to the format name as-is.
140
+ _FORMAT_EXTENSIONS: dict[str, str] = {
141
+ "json": "json",
142
+ "jsonl": "jsonl",
143
+ "xml": "xml",
144
+ "csv": "csv",
145
+ "parquet": "parquet",
146
+ "excel": "xlsx",
147
+ "avro": "avro",
148
+ "fixed_width": "fwf",
149
+ }
123
150
 
124
151
  def __init__(
125
152
  self,
@@ -142,6 +169,7 @@ class HttpToFilesystem(BaseOperator):
142
169
  data_transformation_kwargs: dict[str, Any] | None = None,
143
170
  file_number_start: int = 1,
144
171
  strict_response_schema: bool = True,
172
+ requests_per_second: float | None = None,
145
173
  *args,
146
174
  **kwargs,
147
175
  ):
@@ -170,14 +198,15 @@ class HttpToFilesystem(BaseOperator):
170
198
  self.source_format = source_format if source_format else save_format
171
199
  self.file_number_start = file_number_start
172
200
  self.strict_response_schema = strict_response_schema
201
+ self.requests_per_second = requests_per_second
173
202
  self.kwargs = kwargs
174
203
 
175
204
  if (
176
- self.save_format in self.binary_response_source_format
205
+ self.source_format in self.binary_response_source_format
177
206
  and self.compression is not None
178
207
  ):
179
208
  raise ValueError(
180
- f"Compression is not supported for binary response save formats: {self.binary_response_source_format}"
209
+ f"Compression is not supported for binary source formats: {self.binary_response_source_format}"
181
210
  )
182
211
 
183
212
  if self.data_transformation and not callable(self.data_transformation):
@@ -204,10 +233,12 @@ class HttpToFilesystem(BaseOperator):
204
233
  response_filter=self._response_filter,
205
234
  pagination_function=self.pagination_function,
206
235
  )
236
+ delay = (1.0 / self.requests_per_second) if self.requests_per_second else 0.0
207
237
  for i, data in enumerate(
208
238
  http_batch_operator.execute(
209
239
  context,
210
240
  use_new_data_parameters_on_pagination=self.use_new_data_parameters_on_pagination,
241
+ delay=delay,
211
242
  ),
212
243
  start=self.file_number_start,
213
244
  ):
@@ -232,9 +263,9 @@ class HttpToFilesystem(BaseOperator):
232
263
  )
233
264
  filesystem_protocol.write(BytesIO(), success_file_path)
234
265
 
235
- def _file_name(self, n_part) -> str:
236
- file_name = f"part{n_part:04}.{self.save_format}"
237
-
266
+ def _file_name(self, n_part: int) -> str:
267
+ ext = self._FORMAT_EXTENSIONS.get(self.save_format, self.save_format)
268
+ file_name = f"part{n_part:04}.{ext}"
238
269
  if self.compression:
239
270
  file_name += f".{self.compression}"
240
271
  return file_name
@@ -263,7 +294,6 @@ class HttpToFilesystem(BaseOperator):
263
294
 
264
295
  self.response_filter_data = data
265
296
 
266
- # Check if we have a custom data transformation
267
297
  if self.data_transformation and self.data_transformation_kwargs:
268
298
  transformed = self.data_transformation(
269
299
  data, self.data_transformation_kwargs
@@ -273,8 +303,6 @@ class HttpToFilesystem(BaseOperator):
273
303
  transformed = self.data_transformation(data)
274
304
  return self._ensure_bytesio(transformed)
275
305
 
276
- # If we don't have a custom data transformation, use the default one based on the source_format
277
-
278
306
  match self.source_format:
279
307
  case "json":
280
308
  return json_to_binary(data, self.compression)
@@ -291,10 +319,10 @@ class HttpToFilesystem(BaseOperator):
291
319
  return BytesIO()
292
320
  return list_to_jsonl(data, self.compression)
293
321
 
294
- case "xml":
322
+ case "xml" | "fixed_width":
295
323
  return xml_to_binary(data, self.compression)
296
324
 
297
- case "parquet":
325
+ case "parquet" | "excel" | "avro":
298
326
  return self._ensure_bytesio(data)
299
327
 
300
328
  case "csv":
@@ -306,9 +334,6 @@ class HttpToFilesystem(BaseOperator):
306
334
  )
307
335
 
308
336
  def _ensure_bytesio(self, value: BytesIO | bytes | str) -> BytesIO:
309
- """
310
- Ensure the transformation output is a BytesIO object.
311
- """
312
337
  if isinstance(value, BytesIO):
313
338
  return value
314
339
  if isinstance(value, bytes):
@@ -329,25 +354,28 @@ class MultiHttpToFilesystem(HttpToFilesystem):
329
354
  Args:
330
355
  multi_requests: List of request specifications. Each item can override
331
356
  any base operator parameter for that specific request.
357
+ max_workers: Number of threads for parallel execution. None (default) runs
358
+ requests sequentially. Set to an integer to enable concurrency.
359
+ Note: rate limiting (requests_per_second) is only applied in
360
+ sequential mode.
332
361
 
333
362
  Example:
334
363
  MultiHttpToFilesystem(
335
364
  http_conn_id='api_connection',
336
- base_endpoint='/api/v1',
337
365
  headers={'Authorization': 'Bearer token'},
338
366
  multi_requests=[
339
367
  {'endpoint': '/users/1'},
340
368
  {'endpoint': '/users/2', 'method': 'POST', 'data': {...}},
341
- {'endpoint': '/orders', 'headers': {'Custom': 'Header'}}
369
+ {'endpoint': '/orders', 'headers': {'Custom': 'Header'}},
342
370
  ]
343
371
  )
344
372
 
345
373
  Notes:
346
374
  - Pagination is not supported
347
- - Requests are executed sequentially within the task
375
+ - Requests are executed sequentially by default; set max_workers for concurrency
348
376
  - Per-request values override base configuration with dict merging for
349
377
  headers/data, and replacement for other parameters
350
- - All validations are re-applied after each request configuration
378
+ - All validations are re-applied after each request configuration override
351
379
  """
352
380
 
353
381
  template_fields = HttpToFilesystem.template_fields + ["multi_requests"]
@@ -356,11 +384,15 @@ class MultiHttpToFilesystem(HttpToFilesystem):
356
384
  "multi_requests": "py",
357
385
  }
358
386
 
359
- # Allowed keys come from the TypedDict (so static + runtime stay in sync)
360
387
  _ALLOWED_KEYS = set(RequestSpec.__annotations__.keys())
361
388
 
362
- def __init__(self, *, multi_requests: list[RequestSpec], **kwargs):
363
- # No pagination in this multi operator
389
+ def __init__(
390
+ self,
391
+ *,
392
+ multi_requests: list[RequestSpec],
393
+ max_workers: int | None = None,
394
+ **kwargs,
395
+ ):
364
396
  if kwargs.get("pagination_function") is not None:
365
397
  raise ValueError("Pagination is not supported in MultiHttpToFilesystem")
366
398
 
@@ -369,6 +401,7 @@ class MultiHttpToFilesystem(HttpToFilesystem):
369
401
 
370
402
  super().__init__(**kwargs)
371
403
  self.multi_requests: list[RequestSpec] = multi_requests
404
+ self.max_workers = max_workers
372
405
 
373
406
  def _capture_request_state(self) -> RequestState:
374
407
  return {
@@ -396,29 +429,24 @@ class MultiHttpToFilesystem(HttpToFilesystem):
396
429
 
397
430
  @staticmethod
398
431
  def _merge_or_replace(base_val: Any, override_val: Any) -> Any:
399
- # Shallow-merge dicts; otherwise replace.
400
432
  if isinstance(base_val, dict) and isinstance(override_val, dict):
401
433
  return {**base_val, **override_val}
402
434
  return override_val
403
435
 
404
436
  def _apply_request_overrides(self, spec: RequestSpec, base: RequestState) -> None:
405
- """
406
- Apply per-request configuration overrides on top of base operator settings.
437
+ """Apply per-request configuration overrides on top of base operator settings.
407
438
 
408
439
  Args:
409
- spec: Request-specific configuration overrides
410
- base: Base operator configuration to restore after request
440
+ spec: Request-specific configuration overrides.
441
+ base: Base operator configuration to restore after request.
411
442
 
412
443
  Raises:
413
- ValueError: If spec contains unknown keys or invalid combinations
444
+ ValueError: If spec contains unknown keys or invalid combinations.
414
445
  """
415
-
416
- # Validate allowed override keys
417
446
  unknown = set(spec.keys()) - self._ALLOWED_KEYS
418
447
  if unknown:
419
448
  raise ValueError(f"Unknown keys in multi_requests item: {sorted(unknown)}")
420
449
 
421
- # Simple fields
422
450
  self.endpoint = spec.get("endpoint", base["endpoint"])
423
451
  self.method = spec.get("method", base["method"])
424
452
  self.auth_type = spec.get("auth_type", base["auth_type"])
@@ -426,7 +454,6 @@ class MultiHttpToFilesystem(HttpToFilesystem):
426
454
  "jmespath_expression", base["jmespath_expression"]
427
455
  )
428
456
 
429
- # Dict-like merge/replace behavior
430
457
  if "headers" in spec:
431
458
  self.headers = self._merge_or_replace(base["headers"], spec["headers"])
432
459
  else:
@@ -437,22 +464,19 @@ class MultiHttpToFilesystem(HttpToFilesystem):
437
464
  else:
438
465
  self.data = base["data"]
439
466
 
440
- # Formats & compression
441
467
  self.save_format = spec.get("save_format", base["save_format"])
442
468
  self.source_format = spec.get("source_format", base["source_format"])
443
469
  self.compression = spec.get("compression", base["compression"])
444
470
 
445
- # Validate this request's final state
446
471
  self._validate_current_request_state()
447
472
 
448
473
  def _validate_current_request_state(self) -> None:
449
- # Re-apply critical validations that may be affected by per-request overrides
450
474
  if (
451
- self.save_format in self.binary_response_source_format
475
+ self.source_format in self.binary_response_source_format
452
476
  and self.compression is not None
453
477
  ):
454
478
  raise ValueError(
455
- f"Compression is not supported for binary response save formats: "
479
+ f"Compression is not supported for binary source formats: "
456
480
  f"{self.binary_response_source_format}"
457
481
  )
458
482
  if self.data_transformation and not callable(self.data_transformation):
@@ -466,15 +490,91 @@ class MultiHttpToFilesystem(HttpToFilesystem):
466
490
  "data_transformation must be provided if data_transformation_kwargs is provided"
467
491
  )
468
492
 
469
- def execute(self, context) -> Any:
493
+ @staticmethod
494
+ def _execute_one_request(
495
+ base_op: "MultiHttpToFilesystem",
496
+ context: Context,
497
+ spec: RequestSpec,
498
+ base_state: RequestState,
499
+ file_number: int,
500
+ ) -> None:
501
+ """Execute a single request as an independent HttpToFilesystem (thread-safe).
502
+
503
+ Creates a standalone operator instance with the merged configuration so that
504
+ parallel workers never share mutable state.
505
+ """
506
+ unknown = set(spec.keys()) - MultiHttpToFilesystem._ALLOWED_KEYS
507
+ if unknown:
508
+ raise ValueError(f"Unknown keys in multi_requests item: {sorted(unknown)}")
509
+
510
+ merged: dict[str, Any] = {
511
+ "endpoint": spec.get("endpoint", base_state["endpoint"]),
512
+ "method": spec.get("method", base_state["method"]),
513
+ "auth_type": spec.get("auth_type", base_state["auth_type"]),
514
+ "jmespath_expression": spec.get(
515
+ "jmespath_expression", base_state["jmespath_expression"]
516
+ ),
517
+ "headers": MultiHttpToFilesystem._merge_or_replace(
518
+ base_state["headers"], spec["headers"]
519
+ )
520
+ if "headers" in spec
521
+ else base_state["headers"],
522
+ "data": MultiHttpToFilesystem._merge_or_replace(
523
+ base_state["data"], spec["data"]
524
+ )
525
+ if "data" in spec
526
+ else base_state["data"],
527
+ "save_format": spec.get("save_format", base_state["save_format"]),
528
+ "source_format": spec.get("source_format", base_state["source_format"]),
529
+ "compression": spec.get("compression", base_state["compression"]),
530
+ }
531
+
532
+ op = HttpToFilesystem(
533
+ task_id=f"http-{uuid.uuid4()}",
534
+ http_conn_id=base_op.http_conn_id,
535
+ filesystem_conn_id=base_op.filesystem_conn_id,
536
+ filesystem_path=base_op.filesystem_path,
537
+ data_transformation=base_op.data_transformation,
538
+ data_transformation_kwargs=base_op.data_transformation_kwargs or None,
539
+ create_file_on_success=base_op.create_file_on_success,
540
+ strict_response_schema=base_op.strict_response_schema,
541
+ requests_per_second=None,
542
+ file_number_start=file_number,
543
+ **merged,
544
+ )
545
+ op.execute(context)
546
+
547
+ def execute(self, context: Context) -> Any:
470
548
  base = self._capture_request_state()
471
- for i, spec in enumerate(self.multi_requests, start=1):
472
- self.file_number_start = i
473
- try:
474
- self._apply_request_overrides(spec, base)
475
- super().execute(context)
476
- finally:
477
- self._restore_request_state(base)
549
+
550
+ if self.max_workers:
551
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
552
+ futures = [
553
+ executor.submit(
554
+ MultiHttpToFilesystem._execute_one_request,
555
+ self,
556
+ context,
557
+ spec,
558
+ base,
559
+ i,
560
+ )
561
+ for i, spec in enumerate(self.multi_requests, start=1)
562
+ ]
563
+ for future in as_completed(futures):
564
+ future.result()
565
+ else:
566
+ delay = (
567
+ (1.0 / self.requests_per_second) if self.requests_per_second else 0.0
568
+ )
569
+ for i, spec in enumerate(self.multi_requests, start=1):
570
+ if i > 1 and delay > 0:
571
+ time.sleep(delay)
572
+ self.file_number_start = i
573
+ try:
574
+ self._apply_request_overrides(spec, base)
575
+ super().execute(context)
576
+ finally:
577
+ self._restore_request_state(base)
478
578
 
479
579
 
480
580
  def list_to_jsonl(data: list[dict], compression: "CompressionOptions") -> BytesIO:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: airflow-toolkit
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
5
5
  Author-email: Biel Llobera <biel_llobera@dkl.digital>
6
6
  Requires-Python: <3.15,>=3.11
@@ -145,8 +145,8 @@ pip install "airflow-toolkit[airflow3-full]"
145
145
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
146
146
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
147
147
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
148
- | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
149
- | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
148
+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
149
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
150
150
  | `airflow3-full` | all of the above | Quick start / development |
151
151
 
152
152
  ---
@@ -191,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
191
191
 
192
192
  ### HttpToFilesystem
193
193
 
194
- Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
194
+ Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
195
195
 
196
196
  ```python
197
197
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
@@ -208,7 +208,7 @@ HttpToFilesystem(
208
208
  )
209
209
  ```
210
210
 
211
- With cursor-based pagination:
211
+ **With cursor-based pagination:**
212
212
 
213
213
  ```python
214
214
  def next_page(response):
@@ -230,9 +230,70 @@ HttpToFilesystem(
230
230
  )
231
231
  ```
232
232
 
233
+ **With OAuth 2.0 Client Credentials:**
234
+
235
+ `OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
236
+
237
+ ```python
238
+ from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
239
+
240
+ HttpToFilesystem(
241
+ task_id='fetch_protected_data',
242
+ http_conn_id='my_api',
243
+ filesystem_conn_id='my_data_lake',
244
+ filesystem_path='raw/data/{{ ds }}/',
245
+ endpoint='/api/v1/data',
246
+ method='GET',
247
+ save_format='jsonl',
248
+ auth_type=OAuth2ClientCredentials.client_credentials(
249
+ token_url='https://auth.example.com/oauth2/token',
250
+ client_id='{{ var.value.oauth2_client_id }}',
251
+ client_secret='{{ var.value.oauth2_client_secret }}',
252
+ scope='read', # optional
253
+ ),
254
+ )
255
+ ```
256
+
257
+ **With rate limiting:**
258
+
259
+ Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
260
+
261
+ ```python
262
+ HttpToFilesystem(
263
+ task_id='fetch_with_rate_limit',
264
+ http_conn_id='my_api',
265
+ filesystem_conn_id='my_data_lake',
266
+ filesystem_path='raw/events/{{ ds }}/',
267
+ endpoint='/api/v1/events',
268
+ method='GET',
269
+ pagination_function=next_page,
270
+ save_format='jsonl',
271
+ requests_per_second=3.0, # send at most 3 page requests per second
272
+ )
273
+ ```
274
+
275
+ **Supported response formats:**
276
+
277
+ `save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
278
+
279
+ | `source_format` / `save_format` | File extension | Notes |
280
+ |---|---|---|
281
+ | `json` | `.json` | Single JSON object or array |
282
+ | `jsonl` | `.jsonl` | Array response written as one record per line |
283
+ | `csv` | `.csv` | Raw CSV text from the response |
284
+ | `xml` | `.xml` | Raw XML text from the response |
285
+ | `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
286
+ | `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
287
+ | `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
288
+ | `fixed_width` | `.fwf` | Fixed-width text from the response |
289
+
290
+ All text and JSON formats support gzip/zip compression via the `compression` parameter.
291
+
233
292
  ### MultiHttpToFilesystem
234
293
 
235
- Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Useful for fetching multiple entities or date ranges without creating one task per request.
294
+ Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
295
+
296
+ **Sequential with rate limiting:**
236
297
 
237
298
  ```python
238
299
  from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
@@ -244,6 +305,7 @@ MultiHttpToFilesystem(
244
305
  filesystem_path='raw/reference/{{ ds }}/',
245
306
  method='GET',
246
307
  save_format='jsonl',
308
+ requests_per_second=2.0, # send at most 2 requests per second
247
309
  multi_requests=[
248
310
  {'endpoint': '/api/v1/categories'},
249
311
  {'endpoint': '/api/v1/statuses'},
@@ -252,6 +314,31 @@ MultiHttpToFilesystem(
252
314
  )
253
315
  ```
254
316
 
317
+ **Parallel execution:**
318
+
319
+ Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
320
+
321
+ ```python
322
+ MultiHttpToFilesystem(
323
+ task_id='fetch_users_parallel',
324
+ http_conn_id='my_api',
325
+ filesystem_conn_id='my_data_lake',
326
+ filesystem_path='raw/users/{{ ds }}/',
327
+ method='GET',
328
+ save_format='json',
329
+ max_workers=5, # up to 5 concurrent threads
330
+ multi_requests=[
331
+ {'endpoint': '/api/v1/users/1'},
332
+ {'endpoint': '/api/v1/users/2'},
333
+ {'endpoint': '/api/v1/users/3'},
334
+ {'endpoint': '/api/v1/users/4'},
335
+ {'endpoint': '/api/v1/users/5'},
336
+ ],
337
+ )
338
+ ```
339
+
340
+ > Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
341
+
255
342
  Each entry in `multi_requests` can override the following base parameters: `endpoint`, `method`, `auth_type`, `headers`, `data`, `jmespath_expression`, `save_format`, `source_format`, and `compression`.
256
343
 
257
344
  ### SQLToFilesystem
@@ -48,5 +48,6 @@ src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py
48
48
  src/airflow_toolkit/providers/filesystem/__init__.py
49
49
  src/airflow_toolkit/providers/filesystem/tasks.py
50
50
  src/airflow_toolkit/providers/filesystem/operators/__init__.py
51
+ src/airflow_toolkit/providers/filesystem/operators/auth.py
51
52
  src/airflow_toolkit/providers/filesystem/operators/filesystem.py
52
53
  src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py