airflow-toolkit 2.3.0__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airflow_toolkit-2.3.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.4.0}/PKG-INFO +93 -6
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/README.md +92 -5
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/pyproject.toml +1 -1
- airflow_toolkit-2.4.0/src/airflow_toolkit/providers/filesystem/operators/auth.py +66 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +147 -47
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0/src/airflow_toolkit.egg-info}/PKG-INFO +93 -6
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/SOURCES.txt +1 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/LICENSE.txt +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/setup.cfg +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/compression_utils.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/exceptions.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/context.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/protocols.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/package.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/py.typed +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/testing.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/types.py +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/requires.txt +0 -0
- {airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: airflow-toolkit
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
|
|
5
5
|
Author-email: Biel Llobera <biel_llobera@dkl.digital>
|
|
6
6
|
Requires-Python: <3.15,>=3.11
|
|
@@ -145,8 +145,8 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
145
145
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
146
146
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
147
147
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
148
|
-
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
149
|
-
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
148
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
149
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
150
150
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
151
151
|
|
|
152
152
|
---
|
|
@@ -191,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
|
|
|
191
191
|
|
|
192
192
|
### HttpToFilesystem
|
|
193
193
|
|
|
194
|
-
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
|
|
194
|
+
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
|
|
195
195
|
|
|
196
196
|
```python
|
|
197
197
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
|
|
@@ -208,7 +208,7 @@ HttpToFilesystem(
|
|
|
208
208
|
)
|
|
209
209
|
```
|
|
210
210
|
|
|
211
|
-
With cursor-based pagination
|
|
211
|
+
**With cursor-based pagination:**
|
|
212
212
|
|
|
213
213
|
```python
|
|
214
214
|
def next_page(response):
|
|
@@ -230,9 +230,70 @@ HttpToFilesystem(
|
|
|
230
230
|
)
|
|
231
231
|
```
|
|
232
232
|
|
|
233
|
+
**With OAuth 2.0 Client Credentials:**
|
|
234
|
+
|
|
235
|
+
`OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
|
|
239
|
+
|
|
240
|
+
HttpToFilesystem(
|
|
241
|
+
task_id='fetch_protected_data',
|
|
242
|
+
http_conn_id='my_api',
|
|
243
|
+
filesystem_conn_id='my_data_lake',
|
|
244
|
+
filesystem_path='raw/data/{{ ds }}/',
|
|
245
|
+
endpoint='/api/v1/data',
|
|
246
|
+
method='GET',
|
|
247
|
+
save_format='jsonl',
|
|
248
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
249
|
+
token_url='https://auth.example.com/oauth2/token',
|
|
250
|
+
client_id='{{ var.value.oauth2_client_id }}',
|
|
251
|
+
client_secret='{{ var.value.oauth2_client_secret }}',
|
|
252
|
+
scope='read', # optional
|
|
253
|
+
),
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**With rate limiting:**
|
|
258
|
+
|
|
259
|
+
Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
HttpToFilesystem(
|
|
263
|
+
task_id='fetch_with_rate_limit',
|
|
264
|
+
http_conn_id='my_api',
|
|
265
|
+
filesystem_conn_id='my_data_lake',
|
|
266
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
267
|
+
endpoint='/api/v1/events',
|
|
268
|
+
method='GET',
|
|
269
|
+
pagination_function=next_page,
|
|
270
|
+
save_format='jsonl',
|
|
271
|
+
requests_per_second=3.0, # max 3 requests per second between pages
|
|
272
|
+
)
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
**Supported response formats:**
|
|
276
|
+
|
|
277
|
+
`save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
|
|
278
|
+
|
|
279
|
+
| `source_format` / `save_format` | File extension | Notes |
|
|
280
|
+
|---|---|---|
|
|
281
|
+
| `json` | `.json` | Single JSON object or array |
|
|
282
|
+
| `jsonl` | `.jsonl` | Array response written as one record per line |
|
|
283
|
+
| `csv` | `.csv` | Raw CSV text from the response |
|
|
284
|
+
| `xml` | `.xml` | Raw XML text from the response |
|
|
285
|
+
| `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
|
|
286
|
+
| `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
|
|
287
|
+
| `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
|
|
288
|
+
| `fixed_width` | `.fwf` | Fixed-width text from the response |
|
|
289
|
+
|
|
290
|
+
All text and JSON formats support gzip/zip compression via the `compression` parameter.
|
|
291
|
+
|
|
233
292
|
### MultiHttpToFilesystem
|
|
234
293
|
|
|
235
|
-
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file.
|
|
294
|
+
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
|
|
295
|
+
|
|
296
|
+
**Sequential with rate limiting:**
|
|
236
297
|
|
|
237
298
|
```python
|
|
238
299
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
|
|
@@ -244,6 +305,7 @@ MultiHttpToFilesystem(
|
|
|
244
305
|
filesystem_path='raw/reference/{{ ds }}/',
|
|
245
306
|
method='GET',
|
|
246
307
|
save_format='jsonl',
|
|
308
|
+
requests_per_second=2.0, # max 2 requests per second between calls
|
|
247
309
|
multi_requests=[
|
|
248
310
|
{'endpoint': '/api/v1/categories'},
|
|
249
311
|
{'endpoint': '/api/v1/statuses'},
|
|
@@ -252,6 +314,31 @@ MultiHttpToFilesystem(
|
|
|
252
314
|
)
|
|
253
315
|
```
|
|
254
316
|
|
|
317
|
+
**Parallel execution:**
|
|
318
|
+
|
|
319
|
+
Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
MultiHttpToFilesystem(
|
|
323
|
+
task_id='fetch_users_parallel',
|
|
324
|
+
http_conn_id='my_api',
|
|
325
|
+
filesystem_conn_id='my_data_lake',
|
|
326
|
+
filesystem_path='raw/users/{{ ds }}/',
|
|
327
|
+
method='GET',
|
|
328
|
+
save_format='json',
|
|
329
|
+
max_workers=5, # up to 5 concurrent threads
|
|
330
|
+
multi_requests=[
|
|
331
|
+
{'endpoint': '/api/v1/users/1'},
|
|
332
|
+
{'endpoint': '/api/v1/users/2'},
|
|
333
|
+
{'endpoint': '/api/v1/users/3'},
|
|
334
|
+
{'endpoint': '/api/v1/users/4'},
|
|
335
|
+
{'endpoint': '/api/v1/users/5'},
|
|
336
|
+
],
|
|
337
|
+
)
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
> Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
|
|
341
|
+
|
|
255
342
|
Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
|
|
256
343
|
|
|
257
344
|
### SQLToFilesystem
|
|
@@ -86,8 +86,8 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
86
86
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
87
87
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
88
88
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
89
|
-
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
90
|
-
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
89
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
90
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
91
91
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
92
92
|
|
|
93
93
|
---
|
|
@@ -132,7 +132,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
|
|
|
132
132
|
|
|
133
133
|
### HttpToFilesystem
|
|
134
134
|
|
|
135
|
-
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
|
|
135
|
+
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
|
|
136
136
|
|
|
137
137
|
```python
|
|
138
138
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
|
|
@@ -149,7 +149,7 @@ HttpToFilesystem(
|
|
|
149
149
|
)
|
|
150
150
|
```
|
|
151
151
|
|
|
152
|
-
With cursor-based pagination
|
|
152
|
+
**With cursor-based pagination:**
|
|
153
153
|
|
|
154
154
|
```python
|
|
155
155
|
def next_page(response):
|
|
@@ -171,9 +171,70 @@ HttpToFilesystem(
|
|
|
171
171
|
)
|
|
172
172
|
```
|
|
173
173
|
|
|
174
|
+
**With OAuth 2.0 Client Credentials:**
|
|
175
|
+
|
|
176
|
+
`OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
|
|
180
|
+
|
|
181
|
+
HttpToFilesystem(
|
|
182
|
+
task_id='fetch_protected_data',
|
|
183
|
+
http_conn_id='my_api',
|
|
184
|
+
filesystem_conn_id='my_data_lake',
|
|
185
|
+
filesystem_path='raw/data/{{ ds }}/',
|
|
186
|
+
endpoint='/api/v1/data',
|
|
187
|
+
method='GET',
|
|
188
|
+
save_format='jsonl',
|
|
189
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
190
|
+
token_url='https://auth.example.com/oauth2/token',
|
|
191
|
+
client_id='{{ var.value.oauth2_client_id }}',
|
|
192
|
+
client_secret='{{ var.value.oauth2_client_secret }}',
|
|
193
|
+
scope='read', # optional
|
|
194
|
+
),
|
|
195
|
+
)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
**With rate limiting:**
|
|
199
|
+
|
|
200
|
+
Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
HttpToFilesystem(
|
|
204
|
+
task_id='fetch_with_rate_limit',
|
|
205
|
+
http_conn_id='my_api',
|
|
206
|
+
filesystem_conn_id='my_data_lake',
|
|
207
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
208
|
+
endpoint='/api/v1/events',
|
|
209
|
+
method='GET',
|
|
210
|
+
pagination_function=next_page,
|
|
211
|
+
save_format='jsonl',
|
|
212
|
+
requests_per_second=3.0, # max 3 requests per second between pages
|
|
213
|
+
)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
**Supported response formats:**
|
|
217
|
+
|
|
218
|
+
`save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
|
|
219
|
+
|
|
220
|
+
| `source_format` / `save_format` | File extension | Notes |
|
|
221
|
+
|---|---|---|
|
|
222
|
+
| `json` | `.json` | Single JSON object or array |
|
|
223
|
+
| `jsonl` | `.jsonl` | Array response written as one record per line |
|
|
224
|
+
| `csv` | `.csv` | Raw CSV text from the response |
|
|
225
|
+
| `xml` | `.xml` | Raw XML text from the response |
|
|
226
|
+
| `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
|
|
227
|
+
| `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
|
|
228
|
+
| `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
|
|
229
|
+
| `fixed_width` | `.fwf` | Fixed-width text from the response |
|
|
230
|
+
|
|
231
|
+
All text and JSON formats support gzip/zip compression via the `compression` parameter.
|
|
232
|
+
|
|
174
233
|
### MultiHttpToFilesystem
|
|
175
234
|
|
|
176
|
-
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file.
|
|
235
|
+
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
|
|
236
|
+
|
|
237
|
+
**Sequential with rate limiting:**
|
|
177
238
|
|
|
178
239
|
```python
|
|
179
240
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
|
|
@@ -185,6 +246,7 @@ MultiHttpToFilesystem(
|
|
|
185
246
|
filesystem_path='raw/reference/{{ ds }}/',
|
|
186
247
|
method='GET',
|
|
187
248
|
save_format='jsonl',
|
|
249
|
+
requests_per_second=2.0, # max 2 requests per second between calls
|
|
188
250
|
multi_requests=[
|
|
189
251
|
{'endpoint': '/api/v1/categories'},
|
|
190
252
|
{'endpoint': '/api/v1/statuses'},
|
|
@@ -193,6 +255,31 @@ MultiHttpToFilesystem(
|
|
|
193
255
|
)
|
|
194
256
|
```
|
|
195
257
|
|
|
258
|
+
**Parallel execution:**
|
|
259
|
+
|
|
260
|
+
Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
MultiHttpToFilesystem(
|
|
264
|
+
task_id='fetch_users_parallel',
|
|
265
|
+
http_conn_id='my_api',
|
|
266
|
+
filesystem_conn_id='my_data_lake',
|
|
267
|
+
filesystem_path='raw/users/{{ ds }}/',
|
|
268
|
+
method='GET',
|
|
269
|
+
save_format='json',
|
|
270
|
+
max_workers=5, # up to 5 concurrent threads
|
|
271
|
+
multi_requests=[
|
|
272
|
+
{'endpoint': '/api/v1/users/1'},
|
|
273
|
+
{'endpoint': '/api/v1/users/2'},
|
|
274
|
+
{'endpoint': '/api/v1/users/3'},
|
|
275
|
+
{'endpoint': '/api/v1/users/4'},
|
|
276
|
+
{'endpoint': '/api/v1/users/5'},
|
|
277
|
+
],
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
> Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
|
|
282
|
+
|
|
196
283
|
Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
|
|
197
284
|
|
|
198
285
|
### SQLToFilesystem
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "airflow-toolkit"
|
|
3
|
-
version = "2.3.0"
|
|
3
|
+
version = "2.4.0"
|
|
4
4
|
description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
|
|
5
5
|
authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
|
|
6
6
|
requires-python = ">=3.11,<3.15"
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
from requests.auth import AuthBase
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from requests import PreparedRequest
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OAuth2ClientCredentials:
|
|
14
|
+
"""Factory for OAuth 2.0 Client Credentials auth.
|
|
15
|
+
|
|
16
|
+
Returns a configured AuthBase class that fetches and caches tokens,
|
|
17
|
+
refreshing automatically 30 seconds before expiry.
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
21
|
+
token_url="https://auth.example.com/token",
|
|
22
|
+
client_id="{{ var.value.client_id }}",
|
|
23
|
+
client_secret="{{ var.value.client_secret }}",
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
@staticmethod
|
|
28
|
+
def client_credentials(
|
|
29
|
+
token_url: str,
|
|
30
|
+
client_id: str,
|
|
31
|
+
client_secret: str,
|
|
32
|
+
scope: str | None = None,
|
|
33
|
+
) -> type[AuthBase]:
|
|
34
|
+
"""Return a configured AuthBase subclass for OAuth 2.0 Client Credentials.
|
|
35
|
+
|
|
36
|
+
Each call produces an independent class with its own token cache, so
|
|
37
|
+
multiple operators with different credentials do not share tokens.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
class _OAuth2Auth(AuthBase):
|
|
41
|
+
_token: str | None = None
|
|
42
|
+
_expiry: float = 0.0
|
|
43
|
+
|
|
44
|
+
def __call__(self, r: "PreparedRequest") -> "PreparedRequest":
|
|
45
|
+
cls = type(self)
|
|
46
|
+
if cls._token is None or time.time() >= cls._expiry - 30:
|
|
47
|
+
cls._refresh()
|
|
48
|
+
r.headers["Authorization"] = f"Bearer {cls._token}"
|
|
49
|
+
return r
|
|
50
|
+
|
|
51
|
+
@classmethod
|
|
52
|
+
def _refresh(cls) -> None:
|
|
53
|
+
payload: dict[str, str] = {
|
|
54
|
+
"grant_type": "client_credentials",
|
|
55
|
+
"client_id": client_id,
|
|
56
|
+
"client_secret": client_secret,
|
|
57
|
+
}
|
|
58
|
+
if scope:
|
|
59
|
+
payload["scope"] = scope
|
|
60
|
+
resp = requests.post(token_url, data=payload)
|
|
61
|
+
resp.raise_for_status()
|
|
62
|
+
data = resp.json()
|
|
63
|
+
cls._token = data["access_token"]
|
|
64
|
+
cls._expiry = time.time() + float(data.get("expires_in", 3600))
|
|
65
|
+
|
|
66
|
+
return _OAuth2Auth
|
|
@@ -2,7 +2,9 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
+
import time
|
|
5
6
|
import uuid
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
8
|
from io import BytesIO, StringIO
|
|
7
9
|
from typing import (
|
|
8
10
|
TYPE_CHECKING,
|
|
@@ -38,7 +40,10 @@ if TYPE_CHECKING:
|
|
|
38
40
|
|
|
39
41
|
class HttpBatchOperator(HttpOperator):
|
|
40
42
|
def execute(
|
|
41
|
-
self,
|
|
43
|
+
self,
|
|
44
|
+
context: Context,
|
|
45
|
+
use_new_data_parameters_on_pagination: bool = False,
|
|
46
|
+
delay: float = 0.0,
|
|
42
47
|
) -> Generator[Any, None, None]:
|
|
43
48
|
self.log.info("Calling HTTP method")
|
|
44
49
|
|
|
@@ -49,16 +54,22 @@ class HttpBatchOperator(HttpOperator):
|
|
|
49
54
|
for response in self.paginate_sync(
|
|
50
55
|
response=response,
|
|
51
56
|
use_new_data_parameters_on_pagination=use_new_data_parameters_on_pagination,
|
|
57
|
+
delay=delay,
|
|
52
58
|
):
|
|
53
59
|
yield self.process_response(context=context, response=response)
|
|
54
60
|
|
|
55
61
|
def paginate_sync(
|
|
56
|
-
self,
|
|
62
|
+
self,
|
|
63
|
+
response: Response,
|
|
64
|
+
use_new_data_parameters_on_pagination: bool = False,
|
|
65
|
+
delay: float = 0.0,
|
|
57
66
|
) -> Generator[Response, None, None]:
|
|
58
67
|
if not self.pagination_function:
|
|
59
68
|
return
|
|
60
69
|
|
|
61
70
|
while True:
|
|
71
|
+
if delay > 0:
|
|
72
|
+
time.sleep(delay)
|
|
62
73
|
next_page_params = self.pagination_function(response)
|
|
63
74
|
if not next_page_params:
|
|
64
75
|
break
|
|
@@ -71,7 +82,9 @@ class HttpBatchOperator(HttpOperator):
|
|
|
71
82
|
return
|
|
72
83
|
|
|
73
84
|
def _merge_next_page_parameters(
|
|
74
|
-
self,
|
|
85
|
+
self,
|
|
86
|
+
next_page_params: dict,
|
|
87
|
+
use_new_data_parameters_on_pagination: bool = False,
|
|
75
88
|
) -> dict:
|
|
76
89
|
"""Merge initial request parameters with next page parameters.
|
|
77
90
|
|
|
@@ -119,7 +132,21 @@ class HttpToFilesystem(BaseOperator):
|
|
|
119
132
|
template_fields_renderers = HttpOperator.template_fields_renderers
|
|
120
133
|
|
|
121
134
|
json_response_source_format = ["json", "jsonl"]
|
|
122
|
-
binary_response_source_format = ["parquet"]
|
|
135
|
+
binary_response_source_format = ["parquet", "excel", "avro"]
|
|
136
|
+
|
|
137
|
+
# Maps source_format/save_format → file extension.
|
|
138
|
+
# Formats that differ from their name (excel → xlsx, fixed_width → fwf) are listed
|
|
139
|
+
# explicitly; all others fall back to using the format name as-is.
|
|
140
|
+
_FORMAT_EXTENSIONS: dict[str, str] = {
|
|
141
|
+
"json": "json",
|
|
142
|
+
"jsonl": "jsonl",
|
|
143
|
+
"xml": "xml",
|
|
144
|
+
"csv": "csv",
|
|
145
|
+
"parquet": "parquet",
|
|
146
|
+
"excel": "xlsx",
|
|
147
|
+
"avro": "avro",
|
|
148
|
+
"fixed_width": "fwf",
|
|
149
|
+
}
|
|
123
150
|
|
|
124
151
|
def __init__(
|
|
125
152
|
self,
|
|
@@ -142,6 +169,7 @@ class HttpToFilesystem(BaseOperator):
|
|
|
142
169
|
data_transformation_kwargs: dict[str, Any] | None = None,
|
|
143
170
|
file_number_start: int = 1,
|
|
144
171
|
strict_response_schema: bool = True,
|
|
172
|
+
requests_per_second: float | None = None,
|
|
145
173
|
*args,
|
|
146
174
|
**kwargs,
|
|
147
175
|
):
|
|
@@ -170,14 +198,15 @@ class HttpToFilesystem(BaseOperator):
|
|
|
170
198
|
self.source_format = source_format if source_format else save_format
|
|
171
199
|
self.file_number_start = file_number_start
|
|
172
200
|
self.strict_response_schema = strict_response_schema
|
|
201
|
+
self.requests_per_second = requests_per_second
|
|
173
202
|
self.kwargs = kwargs
|
|
174
203
|
|
|
175
204
|
if (
|
|
176
|
-
self.
|
|
205
|
+
self.source_format in self.binary_response_source_format
|
|
177
206
|
and self.compression is not None
|
|
178
207
|
):
|
|
179
208
|
raise ValueError(
|
|
180
|
-
f"Compression is not supported for binary
|
|
209
|
+
f"Compression is not supported for binary source formats: {self.binary_response_source_format}"
|
|
181
210
|
)
|
|
182
211
|
|
|
183
212
|
if self.data_transformation and not callable(self.data_transformation):
|
|
@@ -204,10 +233,12 @@ class HttpToFilesystem(BaseOperator):
|
|
|
204
233
|
response_filter=self._response_filter,
|
|
205
234
|
pagination_function=self.pagination_function,
|
|
206
235
|
)
|
|
236
|
+
delay = (1.0 / self.requests_per_second) if self.requests_per_second else 0.0
|
|
207
237
|
for i, data in enumerate(
|
|
208
238
|
http_batch_operator.execute(
|
|
209
239
|
context,
|
|
210
240
|
use_new_data_parameters_on_pagination=self.use_new_data_parameters_on_pagination,
|
|
241
|
+
delay=delay,
|
|
211
242
|
),
|
|
212
243
|
start=self.file_number_start,
|
|
213
244
|
):
|
|
@@ -232,9 +263,9 @@ class HttpToFilesystem(BaseOperator):
|
|
|
232
263
|
)
|
|
233
264
|
filesystem_protocol.write(BytesIO(), success_file_path)
|
|
234
265
|
|
|
235
|
-
def _file_name(self, n_part) -> str:
|
|
236
|
-
|
|
237
|
-
|
|
266
|
+
def _file_name(self, n_part: int) -> str:
|
|
267
|
+
ext = self._FORMAT_EXTENSIONS.get(self.save_format, self.save_format)
|
|
268
|
+
file_name = f"part{n_part:04}.{ext}"
|
|
238
269
|
if self.compression:
|
|
239
270
|
file_name += f".{self.compression}"
|
|
240
271
|
return file_name
|
|
@@ -263,7 +294,6 @@ class HttpToFilesystem(BaseOperator):
|
|
|
263
294
|
|
|
264
295
|
self.response_filter_data = data
|
|
265
296
|
|
|
266
|
-
# Check if we have a custom data transformation
|
|
267
297
|
if self.data_transformation and self.data_transformation_kwargs:
|
|
268
298
|
transformed = self.data_transformation(
|
|
269
299
|
data, self.data_transformation_kwargs
|
|
@@ -273,8 +303,6 @@ class HttpToFilesystem(BaseOperator):
|
|
|
273
303
|
transformed = self.data_transformation(data)
|
|
274
304
|
return self._ensure_bytesio(transformed)
|
|
275
305
|
|
|
276
|
-
# If we don't have a custom data transformation, use the default one based on the source_format
|
|
277
|
-
|
|
278
306
|
match self.source_format:
|
|
279
307
|
case "json":
|
|
280
308
|
return json_to_binary(data, self.compression)
|
|
@@ -291,10 +319,10 @@ class HttpToFilesystem(BaseOperator):
|
|
|
291
319
|
return BytesIO()
|
|
292
320
|
return list_to_jsonl(data, self.compression)
|
|
293
321
|
|
|
294
|
-
case "xml":
|
|
322
|
+
case "xml" | "fixed_width":
|
|
295
323
|
return xml_to_binary(data, self.compression)
|
|
296
324
|
|
|
297
|
-
case "parquet":
|
|
325
|
+
case "parquet" | "excel" | "avro":
|
|
298
326
|
return self._ensure_bytesio(data)
|
|
299
327
|
|
|
300
328
|
case "csv":
|
|
@@ -306,9 +334,6 @@ class HttpToFilesystem(BaseOperator):
|
|
|
306
334
|
)
|
|
307
335
|
|
|
308
336
|
def _ensure_bytesio(self, value: BytesIO | bytes | str) -> BytesIO:
|
|
309
|
-
"""
|
|
310
|
-
Ensure the transformation output is a BytesIO object.
|
|
311
|
-
"""
|
|
312
337
|
if isinstance(value, BytesIO):
|
|
313
338
|
return value
|
|
314
339
|
if isinstance(value, bytes):
|
|
@@ -329,25 +354,28 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
329
354
|
Args:
|
|
330
355
|
multi_requests: List of request specifications. Each item can override
|
|
331
356
|
any base operator parameter for that specific request.
|
|
357
|
+
max_workers: Number of threads for parallel execution. None (default) runs
|
|
358
|
+
requests sequentially. Set to an integer to enable concurrency.
|
|
359
|
+
Note: rate limiting (requests_per_second) is only applied in
|
|
360
|
+
sequential mode.
|
|
332
361
|
|
|
333
362
|
Example:
|
|
334
363
|
MultiHttpToFilesystem(
|
|
335
364
|
http_conn_id='api_connection',
|
|
336
|
-
base_endpoint='/api/v1',
|
|
337
365
|
headers={'Authorization': 'Bearer token'},
|
|
338
366
|
multi_requests=[
|
|
339
367
|
{'endpoint': '/users/1'},
|
|
340
368
|
{'endpoint': '/users/2', 'method': 'POST', 'data': {...}},
|
|
341
|
-
{'endpoint': '/orders', 'headers': {'Custom': 'Header'}}
|
|
369
|
+
{'endpoint': '/orders', 'headers': {'Custom': 'Header'}},
|
|
342
370
|
]
|
|
343
371
|
)
|
|
344
372
|
|
|
345
373
|
Notes:
|
|
346
374
|
- Pagination is not supported
|
|
347
|
-
- Requests are executed sequentially
|
|
375
|
+
- Requests are executed sequentially by default; set max_workers for concurrency
|
|
348
376
|
- Per-request values override base configuration with dict merging for
|
|
349
377
|
headers/data, and replacement for other parameters
|
|
350
|
-
- All validations are re-applied after each request configuration
|
|
378
|
+
- All validations are re-applied after each request configuration override
|
|
351
379
|
"""
|
|
352
380
|
|
|
353
381
|
template_fields = HttpToFilesystem.template_fields + ["multi_requests"]
|
|
@@ -356,11 +384,15 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
356
384
|
"multi_requests": "py",
|
|
357
385
|
}
|
|
358
386
|
|
|
359
|
-
# Allowed keys come from the TypedDict (so static + runtime stay in sync)
|
|
360
387
|
_ALLOWED_KEYS = set(RequestSpec.__annotations__.keys())
|
|
361
388
|
|
|
362
|
-
def __init__(
|
|
363
|
-
|
|
389
|
+
def __init__(
|
|
390
|
+
self,
|
|
391
|
+
*,
|
|
392
|
+
multi_requests: list[RequestSpec],
|
|
393
|
+
max_workers: int | None = None,
|
|
394
|
+
**kwargs,
|
|
395
|
+
):
|
|
364
396
|
if kwargs.get("pagination_function") is not None:
|
|
365
397
|
raise ValueError("Pagination is not supported in MultiHttpToFilesystem")
|
|
366
398
|
|
|
@@ -369,6 +401,7 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
369
401
|
|
|
370
402
|
super().__init__(**kwargs)
|
|
371
403
|
self.multi_requests: list[RequestSpec] = multi_requests
|
|
404
|
+
self.max_workers = max_workers
|
|
372
405
|
|
|
373
406
|
def _capture_request_state(self) -> RequestState:
|
|
374
407
|
return {
|
|
@@ -396,29 +429,24 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
396
429
|
|
|
397
430
|
@staticmethod
|
|
398
431
|
def _merge_or_replace(base_val: Any, override_val: Any) -> Any:
|
|
399
|
-
# Shallow-merge dicts; otherwise replace.
|
|
400
432
|
if isinstance(base_val, dict) and isinstance(override_val, dict):
|
|
401
433
|
return {**base_val, **override_val}
|
|
402
434
|
return override_val
|
|
403
435
|
|
|
404
436
|
def _apply_request_overrides(self, spec: RequestSpec, base: RequestState) -> None:
|
|
405
|
-
"""
|
|
406
|
-
Apply per-request configuration overrides on top of base operator settings.
|
|
437
|
+
"""Apply per-request configuration overrides on top of base operator settings.
|
|
407
438
|
|
|
408
439
|
Args:
|
|
409
|
-
spec: Request-specific configuration overrides
|
|
410
|
-
base: Base operator configuration to restore after request
|
|
440
|
+
spec: Request-specific configuration overrides.
|
|
441
|
+
base: Base operator configuration to restore after request.
|
|
411
442
|
|
|
412
443
|
Raises:
|
|
413
|
-
ValueError: If spec contains unknown keys or invalid combinations
|
|
444
|
+
ValueError: If spec contains unknown keys or invalid combinations.
|
|
414
445
|
"""
|
|
415
|
-
|
|
416
|
-
# Validate allowed override keys
|
|
417
446
|
unknown = set(spec.keys()) - self._ALLOWED_KEYS
|
|
418
447
|
if unknown:
|
|
419
448
|
raise ValueError(f"Unknown keys in multi_requests item: {sorted(unknown)}")
|
|
420
449
|
|
|
421
|
-
# Simple fields
|
|
422
450
|
self.endpoint = spec.get("endpoint", base["endpoint"])
|
|
423
451
|
self.method = spec.get("method", base["method"])
|
|
424
452
|
self.auth_type = spec.get("auth_type", base["auth_type"])
|
|
@@ -426,7 +454,6 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
426
454
|
"jmespath_expression", base["jmespath_expression"]
|
|
427
455
|
)
|
|
428
456
|
|
|
429
|
-
# Dict-like merge/replace behavior
|
|
430
457
|
if "headers" in spec:
|
|
431
458
|
self.headers = self._merge_or_replace(base["headers"], spec["headers"])
|
|
432
459
|
else:
|
|
@@ -437,22 +464,19 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
437
464
|
else:
|
|
438
465
|
self.data = base["data"]
|
|
439
466
|
|
|
440
|
-
# Formats & compression
|
|
441
467
|
self.save_format = spec.get("save_format", base["save_format"])
|
|
442
468
|
self.source_format = spec.get("source_format", base["source_format"])
|
|
443
469
|
self.compression = spec.get("compression", base["compression"])
|
|
444
470
|
|
|
445
|
-
# Validate this request's final state
|
|
446
471
|
self._validate_current_request_state()
|
|
447
472
|
|
|
448
473
|
def _validate_current_request_state(self) -> None:
|
|
449
|
-
# Re-apply critical validations that may be affected by per-request overrides
|
|
450
474
|
if (
|
|
451
|
-
self.
|
|
475
|
+
self.source_format in self.binary_response_source_format
|
|
452
476
|
and self.compression is not None
|
|
453
477
|
):
|
|
454
478
|
raise ValueError(
|
|
455
|
-
f"Compression is not supported for binary
|
|
479
|
+
f"Compression is not supported for binary source formats: "
|
|
456
480
|
f"{self.binary_response_source_format}"
|
|
457
481
|
)
|
|
458
482
|
if self.data_transformation and not callable(self.data_transformation):
|
|
@@ -466,15 +490,91 @@ class MultiHttpToFilesystem(HttpToFilesystem):
|
|
|
466
490
|
"data_transformation must be provided if data_transformation_kwargs is provided"
|
|
467
491
|
)
|
|
468
492
|
|
|
469
|
-
|
|
493
|
+
@staticmethod
|
|
494
|
+
def _execute_one_request(
|
|
495
|
+
base_op: "MultiHttpToFilesystem",
|
|
496
|
+
context: Context,
|
|
497
|
+
spec: RequestSpec,
|
|
498
|
+
base_state: RequestState,
|
|
499
|
+
file_number: int,
|
|
500
|
+
) -> None:
|
|
501
|
+
"""Execute a single request as an independent HttpToFilesystem (thread-safe).
|
|
502
|
+
|
|
503
|
+
Creates a standalone operator instance with the merged configuration so that
|
|
504
|
+
parallel workers never share mutable state.
|
|
505
|
+
"""
|
|
506
|
+
unknown = set(spec.keys()) - MultiHttpToFilesystem._ALLOWED_KEYS
|
|
507
|
+
if unknown:
|
|
508
|
+
raise ValueError(f"Unknown keys in multi_requests item: {sorted(unknown)}")
|
|
509
|
+
|
|
510
|
+
merged: dict[str, Any] = {
|
|
511
|
+
"endpoint": spec.get("endpoint", base_state["endpoint"]),
|
|
512
|
+
"method": spec.get("method", base_state["method"]),
|
|
513
|
+
"auth_type": spec.get("auth_type", base_state["auth_type"]),
|
|
514
|
+
"jmespath_expression": spec.get(
|
|
515
|
+
"jmespath_expression", base_state["jmespath_expression"]
|
|
516
|
+
),
|
|
517
|
+
"headers": MultiHttpToFilesystem._merge_or_replace(
|
|
518
|
+
base_state["headers"], spec["headers"]
|
|
519
|
+
)
|
|
520
|
+
if "headers" in spec
|
|
521
|
+
else base_state["headers"],
|
|
522
|
+
"data": MultiHttpToFilesystem._merge_or_replace(
|
|
523
|
+
base_state["data"], spec["data"]
|
|
524
|
+
)
|
|
525
|
+
if "data" in spec
|
|
526
|
+
else base_state["data"],
|
|
527
|
+
"save_format": spec.get("save_format", base_state["save_format"]),
|
|
528
|
+
"source_format": spec.get("source_format", base_state["source_format"]),
|
|
529
|
+
"compression": spec.get("compression", base_state["compression"]),
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
op = HttpToFilesystem(
|
|
533
|
+
task_id=f"http-{uuid.uuid4()}",
|
|
534
|
+
http_conn_id=base_op.http_conn_id,
|
|
535
|
+
filesystem_conn_id=base_op.filesystem_conn_id,
|
|
536
|
+
filesystem_path=base_op.filesystem_path,
|
|
537
|
+
data_transformation=base_op.data_transformation,
|
|
538
|
+
data_transformation_kwargs=base_op.data_transformation_kwargs or None,
|
|
539
|
+
create_file_on_success=base_op.create_file_on_success,
|
|
540
|
+
strict_response_schema=base_op.strict_response_schema,
|
|
541
|
+
requests_per_second=None,
|
|
542
|
+
file_number_start=file_number,
|
|
543
|
+
**merged,
|
|
544
|
+
)
|
|
545
|
+
op.execute(context)
|
|
546
|
+
|
|
547
|
+
def execute(self, context: Context) -> Any:
|
|
470
548
|
base = self._capture_request_state()
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
549
|
+
|
|
550
|
+
if self.max_workers:
|
|
551
|
+
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
|
552
|
+
futures = [
|
|
553
|
+
executor.submit(
|
|
554
|
+
MultiHttpToFilesystem._execute_one_request,
|
|
555
|
+
self,
|
|
556
|
+
context,
|
|
557
|
+
spec,
|
|
558
|
+
base,
|
|
559
|
+
i,
|
|
560
|
+
)
|
|
561
|
+
for i, spec in enumerate(self.multi_requests, start=1)
|
|
562
|
+
]
|
|
563
|
+
for future in as_completed(futures):
|
|
564
|
+
future.result()
|
|
565
|
+
else:
|
|
566
|
+
delay = (
|
|
567
|
+
(1.0 / self.requests_per_second) if self.requests_per_second else 0.0
|
|
568
|
+
)
|
|
569
|
+
for i, spec in enumerate(self.multi_requests, start=1):
|
|
570
|
+
if i > 1 and delay > 0:
|
|
571
|
+
time.sleep(delay)
|
|
572
|
+
self.file_number_start = i
|
|
573
|
+
try:
|
|
574
|
+
self._apply_request_overrides(spec, base)
|
|
575
|
+
super().execute(context)
|
|
576
|
+
finally:
|
|
577
|
+
self._restore_request_state(base)
|
|
478
578
|
|
|
479
579
|
|
|
480
580
|
def list_to_jsonl(data: list[dict], compression: "CompressionOptions") -> BytesIO:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: airflow-toolkit
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
|
|
5
5
|
Author-email: Biel Llobera <biel_llobera@dkl.digital>
|
|
6
6
|
Requires-Python: <3.15,>=3.11
|
|
@@ -145,8 +145,8 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
145
145
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
146
146
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
147
147
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
148
|
-
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
149
|
-
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
148
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
149
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` and `HttpToFilesystem` |
|
|
150
150
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
151
151
|
|
|
152
152
|
---
|
|
@@ -191,7 +191,7 @@ Changing the connection's `conn_type` is all that is needed to switch backends
|
|
|
191
191
|
|
|
192
192
|
### HttpToFilesystem
|
|
193
193
|
|
|
194
|
-
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, and custom response transformations.
|
|
194
|
+
Calls an HTTP endpoint and writes the response to any filesystem. Supports pagination, JMESPath filtering, compression, OAuth 2.0 authentication, rate limiting, and custom response transformations.
|
|
195
195
|
|
|
196
196
|
```python
|
|
197
197
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import HttpToFilesystem
|
|
@@ -208,7 +208,7 @@ HttpToFilesystem(
|
|
|
208
208
|
)
|
|
209
209
|
```
|
|
210
210
|
|
|
211
|
-
With cursor-based pagination
|
|
211
|
+
**With cursor-based pagination:**
|
|
212
212
|
|
|
213
213
|
```python
|
|
214
214
|
def next_page(response):
|
|
@@ -230,9 +230,70 @@ HttpToFilesystem(
|
|
|
230
230
|
)
|
|
231
231
|
```
|
|
232
232
|
|
|
233
|
+
**With OAuth 2.0 Client Credentials:**
|
|
234
|
+
|
|
235
|
+
`OAuth2ClientCredentials.client_credentials()` returns a configured auth class that fetches the token lazily on the first request and refreshes it automatically 30 seconds before expiry — no manual token management required.
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from airflow_toolkit.providers.filesystem.operators.auth import OAuth2ClientCredentials
|
|
239
|
+
|
|
240
|
+
HttpToFilesystem(
|
|
241
|
+
task_id='fetch_protected_data',
|
|
242
|
+
http_conn_id='my_api',
|
|
243
|
+
filesystem_conn_id='my_data_lake',
|
|
244
|
+
filesystem_path='raw/data/{{ ds }}/',
|
|
245
|
+
endpoint='/api/v1/data',
|
|
246
|
+
method='GET',
|
|
247
|
+
save_format='jsonl',
|
|
248
|
+
auth_type=OAuth2ClientCredentials.client_credentials(
|
|
249
|
+
token_url='https://auth.example.com/oauth2/token',
|
|
250
|
+
client_id='{{ var.value.oauth2_client_id }}',
|
|
251
|
+
client_secret='{{ var.value.oauth2_client_secret }}',
|
|
252
|
+
scope='read', # optional
|
|
253
|
+
),
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**With rate limiting:**
|
|
258
|
+
|
|
259
|
+
Use `requests_per_second` to cap how fast paginated requests are sent. This is useful when the API enforces a rate limit.
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
HttpToFilesystem(
|
|
263
|
+
task_id='fetch_with_rate_limit',
|
|
264
|
+
http_conn_id='my_api',
|
|
265
|
+
filesystem_conn_id='my_data_lake',
|
|
266
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
267
|
+
endpoint='/api/v1/events',
|
|
268
|
+
method='GET',
|
|
269
|
+
pagination_function=next_page,
|
|
270
|
+
save_format='jsonl',
|
|
271
|
+
requests_per_second=3.0, # max 3 requests per second between pages
|
|
272
|
+
)
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
**Supported response formats:**
|
|
276
|
+
|
|
277
|
+
`save_format` controls how the response is written to the filesystem. For APIs that return binary formats natively (e.g. a reporting API that streams Excel files), set `source_format` to match the response content type:
|
|
278
|
+
|
|
279
|
+
| `source_format` / `save_format` | File extension | Notes |
|
|
280
|
+
|---|---|---|
|
|
281
|
+
| `json` | `.json` | Single JSON object or array |
|
|
282
|
+
| `jsonl` | `.jsonl` | Array response written as one record per line |
|
|
283
|
+
| `csv` | `.csv` | Raw CSV text from the response |
|
|
284
|
+
| `xml` | `.xml` | Raw XML text from the response |
|
|
285
|
+
| `parquet` | `.parquet` | Binary passthrough — API must return Parquet bytes |
|
|
286
|
+
| `excel` | `.xlsx` | Binary passthrough — API must return Excel bytes (requires `[excel]`) |
|
|
287
|
+
| `avro` | `.avro` | Binary passthrough — API must return Avro bytes (requires `[avro]`) |
|
|
288
|
+
| `fixed_width` | `.fwf` | Fixed-width text from the response |
|
|
289
|
+
|
|
290
|
+
All text and JSON formats support gzip/zip compression via the `compression` parameter.
|
|
291
|
+
|
|
233
292
|
### MultiHttpToFilesystem
|
|
234
293
|
|
|
235
|
-
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file.
|
|
294
|
+
Runs multiple HTTP requests in a single Airflow task, saving each response as a separate file. Requests can run **sequentially** (with optional rate limiting) or **in parallel** using a thread pool.
|
|
295
|
+
|
|
296
|
+
**Sequential with rate limiting:**
|
|
236
297
|
|
|
237
298
|
```python
|
|
238
299
|
from airflow_toolkit.providers.filesystem.operators.http_to_filesystem import MultiHttpToFilesystem
|
|
@@ -244,6 +305,7 @@ MultiHttpToFilesystem(
|
|
|
244
305
|
filesystem_path='raw/reference/{{ ds }}/',
|
|
245
306
|
method='GET',
|
|
246
307
|
save_format='jsonl',
|
|
308
|
+
requests_per_second=2.0, # max 2 requests per second between calls
|
|
247
309
|
multi_requests=[
|
|
248
310
|
{'endpoint': '/api/v1/categories'},
|
|
249
311
|
{'endpoint': '/api/v1/statuses'},
|
|
@@ -252,6 +314,31 @@ MultiHttpToFilesystem(
|
|
|
252
314
|
)
|
|
253
315
|
```
|
|
254
316
|
|
|
317
|
+
**Parallel execution:**
|
|
318
|
+
|
|
319
|
+
Set `max_workers` to run requests concurrently using a thread pool. Each request writes to its own file — there are no file collisions.
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
MultiHttpToFilesystem(
|
|
323
|
+
task_id='fetch_users_parallel',
|
|
324
|
+
http_conn_id='my_api',
|
|
325
|
+
filesystem_conn_id='my_data_lake',
|
|
326
|
+
filesystem_path='raw/users/{{ ds }}/',
|
|
327
|
+
method='GET',
|
|
328
|
+
save_format='json',
|
|
329
|
+
max_workers=5, # up to 5 concurrent threads
|
|
330
|
+
multi_requests=[
|
|
331
|
+
{'endpoint': '/api/v1/users/1'},
|
|
332
|
+
{'endpoint': '/api/v1/users/2'},
|
|
333
|
+
{'endpoint': '/api/v1/users/3'},
|
|
334
|
+
{'endpoint': '/api/v1/users/4'},
|
|
335
|
+
{'endpoint': '/api/v1/users/5'},
|
|
336
|
+
],
|
|
337
|
+
)
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
> Rate limiting (`requests_per_second`) applies only in sequential mode. In parallel mode the thread pool controls concurrency — use `max_workers` to avoid overwhelming the API.
|
|
341
|
+
|
|
255
342
|
Each entry in `multi_requests` can override any base parameter (`endpoint`, `method`, `headers`, `data`, `jmespath_expression`, `save_format`, `compression`).
|
|
256
343
|
|
|
257
344
|
### SQLToFilesystem
|
|
@@ -48,5 +48,6 @@ src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py
|
|
|
48
48
|
src/airflow_toolkit/providers/filesystem/__init__.py
|
|
49
49
|
src/airflow_toolkit/providers/filesystem/tasks.py
|
|
50
50
|
src/airflow_toolkit/providers/filesystem/operators/__init__.py
|
|
51
|
+
src/airflow_toolkit/providers/filesystem/operators/auth.py
|
|
51
52
|
src/airflow_toolkit/providers/filesystem/operators/filesystem.py
|
|
52
53
|
src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/_compact/airflow_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/filesystems/impl/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/email.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/slack.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/channels/teams.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/notifications/context.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/azure/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/deltalake/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit/providers/filesystem/tasks.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{airflow_toolkit-2.3.0 → airflow_toolkit-2.4.0}/src/airflow_toolkit.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|