airflow-toolkit 2.2.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airflow_toolkit-2.2.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.3.0}/PKG-INFO +103 -4
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/README.md +96 -3
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/pyproject.toml +9 -1
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/compression_utils.py +2 -2
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +90 -6
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +7 -35
- airflow_toolkit-2.3.0/src/airflow_toolkit/testing.py +59 -0
- airflow_toolkit-2.3.0/src/airflow_toolkit/types.py +51 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0/src/airflow_toolkit.egg-info}/PKG-INFO +103 -4
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/SOURCES.txt +2 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/requires.txt +8 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/LICENSE.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/setup.cfg +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/exceptions.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/context.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/protocols.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/package.py +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/py.typed +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
- {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: airflow-toolkit
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
|
|
5
5
|
Author-email: Biel Llobera <biel_llobera@dkl.digital>
|
|
6
6
|
Requires-Python: <3.15,>=3.11
|
|
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
|
|
|
32
32
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
|
|
33
33
|
Provides-Extra: sqlite
|
|
34
34
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
|
|
35
|
+
Provides-Extra: excel
|
|
36
|
+
Requires-Dist: openpyxl>=3.1; extra == "excel"
|
|
37
|
+
Provides-Extra: avro
|
|
38
|
+
Requires-Dist: fastavro>=1.9; extra == "avro"
|
|
35
39
|
Provides-Extra: airflow3-full
|
|
36
40
|
Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
|
|
37
41
|
Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
|
|
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
|
|
|
49
53
|
Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
|
|
50
54
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
|
|
51
55
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
|
|
56
|
+
Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
|
|
57
|
+
Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
|
|
52
58
|
Dynamic: license-file
|
|
53
59
|
|
|
54
60
|
# Airflow Toolkit
|
|
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
136
142
|
| `google` | `providers-google` | GCS filesystem backend |
|
|
137
143
|
| `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
|
|
138
144
|
| `sftp` | `providers-sftp` | SFTP filesystem backend |
|
|
139
|
-
| `slack` | `providers-slack` | Slack failure notifications |
|
|
140
145
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
141
146
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
142
147
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
148
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
149
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
143
150
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
144
151
|
|
|
145
152
|
---
|
|
@@ -313,7 +320,9 @@ FilesystemToFilesystem(
|
|
|
313
320
|
|
|
314
321
|
### FilesystemToDatabase
|
|
315
322
|
|
|
316
|
-
Reads files
|
|
323
|
+
Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
|
|
324
|
+
|
|
325
|
+
**Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
|
|
317
326
|
|
|
318
327
|
```python
|
|
319
328
|
from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
|
|
@@ -325,7 +334,7 @@ FilesystemToDatabaseOperator(
|
|
|
325
334
|
filesystem_path='raw/orders/{{ ds }}/',
|
|
326
335
|
db_schema='public',
|
|
327
336
|
db_table='orders',
|
|
328
|
-
source_format='csv',
|
|
337
|
+
source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
|
|
329
338
|
table_aggregation_type='append', # 'append' | 'replace' | 'fail'
|
|
330
339
|
metadata={
|
|
331
340
|
'_ds': '{{ ds }}',
|
|
@@ -335,6 +344,52 @@ FilesystemToDatabaseOperator(
|
|
|
335
344
|
)
|
|
336
345
|
```
|
|
337
346
|
|
|
347
|
+
**Excel** (requires the `[excel]` extra):
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
FilesystemToDatabaseOperator(
|
|
351
|
+
task_id='load_excel_report',
|
|
352
|
+
filesystem_conn_id='my_data_lake',
|
|
353
|
+
database_conn_id='my_postgres',
|
|
354
|
+
filesystem_path='raw/reports/{{ ds }}/',
|
|
355
|
+
db_table='monthly_report',
|
|
356
|
+
source_format='excel',
|
|
357
|
+
source_format_options={'sheet_name': 'Data'},
|
|
358
|
+
)
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
**Avro** (requires the `[avro]` extra):
|
|
362
|
+
|
|
363
|
+
```python
|
|
364
|
+
FilesystemToDatabaseOperator(
|
|
365
|
+
task_id='load_avro_events',
|
|
366
|
+
filesystem_conn_id='my_data_lake',
|
|
367
|
+
database_conn_id='my_postgres',
|
|
368
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
369
|
+
db_table='events',
|
|
370
|
+
source_format='avro',
|
|
371
|
+
)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Fixed-width** (no extra required — pandas native):
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
FilesystemToDatabaseOperator(
|
|
378
|
+
task_id='load_fixed_width',
|
|
379
|
+
filesystem_conn_id='my_data_lake',
|
|
380
|
+
database_conn_id='my_postgres',
|
|
381
|
+
filesystem_path='raw/exports/{{ ds }}/',
|
|
382
|
+
db_table='transactions',
|
|
383
|
+
source_format='fixed_width',
|
|
384
|
+
source_format_options={
|
|
385
|
+
'colspecs': [(0, 10), (10, 25), (25, 35)],
|
|
386
|
+
'names': ['date', 'description', 'amount'],
|
|
387
|
+
},
|
|
388
|
+
)
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
|
|
392
|
+
|
|
338
393
|
### DuckdbToDeltalake
|
|
339
394
|
|
|
340
395
|
Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
|
|
@@ -530,6 +585,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
|
|
|
530
585
|
|
|
531
586
|
---
|
|
532
587
|
|
|
588
|
+
## Testing Utilities
|
|
589
|
+
|
|
590
|
+
### MockFilesystem
|
|
591
|
+
|
|
592
|
+
`MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
|
|
593
|
+
|
|
594
|
+
```python
|
|
595
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
596
|
+
|
|
597
|
+
# Pre-load files at construction time
|
|
598
|
+
fs = MockFilesystem({
|
|
599
|
+
"raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
|
|
600
|
+
})
|
|
601
|
+
|
|
602
|
+
# Or write files programmatically
|
|
603
|
+
fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
|
|
604
|
+
|
|
605
|
+
# Inspect the result in assertions
|
|
606
|
+
assert fs.check_file("raw/orders/2024-01-01/data.csv")
|
|
607
|
+
assert len(fs.list_files("raw/orders/")) == 2
|
|
608
|
+
assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
|
|
609
|
+
```
|
|
610
|
+
|
|
611
|
+
Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
|
|
612
|
+
|
|
613
|
+
```python
|
|
614
|
+
from unittest.mock import patch
|
|
615
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
616
|
+
|
|
617
|
+
def test_my_pipeline(tmp_path):
|
|
618
|
+
fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
|
|
619
|
+
|
|
620
|
+
with patch(
|
|
621
|
+
"airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
|
|
622
|
+
return_value=fs,
|
|
623
|
+
):
|
|
624
|
+
# run your operator or task here
|
|
625
|
+
...
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
`MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
|
|
629
|
+
|
|
630
|
+
---
|
|
631
|
+
|
|
533
632
|
## Running Tests
|
|
534
633
|
|
|
535
634
|
### Integration tests
|
|
@@ -83,10 +83,11 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
83
83
|
| `google` | `providers-google` | GCS filesystem backend |
|
|
84
84
|
| `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
|
|
85
85
|
| `sftp` | `providers-sftp` | SFTP filesystem backend |
|
|
86
|
-
| `slack` | `providers-slack` | Slack failure notifications |
|
|
87
86
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
88
87
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
89
88
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
89
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
90
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
90
91
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
91
92
|
|
|
92
93
|
---
|
|
@@ -260,7 +261,9 @@ FilesystemToFilesystem(
|
|
|
260
261
|
|
|
261
262
|
### FilesystemToDatabase
|
|
262
263
|
|
|
263
|
-
Reads files
|
|
264
|
+
Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
|
|
265
|
+
|
|
266
|
+
**Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
|
|
264
267
|
|
|
265
268
|
```python
|
|
266
269
|
from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
|
|
@@ -272,7 +275,7 @@ FilesystemToDatabaseOperator(
|
|
|
272
275
|
filesystem_path='raw/orders/{{ ds }}/',
|
|
273
276
|
db_schema='public',
|
|
274
277
|
db_table='orders',
|
|
275
|
-
source_format='csv',
|
|
278
|
+
source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
|
|
276
279
|
table_aggregation_type='append', # 'append' | 'replace' | 'fail'
|
|
277
280
|
metadata={
|
|
278
281
|
'_ds': '{{ ds }}',
|
|
@@ -282,6 +285,52 @@ FilesystemToDatabaseOperator(
|
|
|
282
285
|
)
|
|
283
286
|
```
|
|
284
287
|
|
|
288
|
+
**Excel** (requires the `[excel]` extra):
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
FilesystemToDatabaseOperator(
|
|
292
|
+
task_id='load_excel_report',
|
|
293
|
+
filesystem_conn_id='my_data_lake',
|
|
294
|
+
database_conn_id='my_postgres',
|
|
295
|
+
filesystem_path='raw/reports/{{ ds }}/',
|
|
296
|
+
db_table='monthly_report',
|
|
297
|
+
source_format='excel',
|
|
298
|
+
source_format_options={'sheet_name': 'Data'},
|
|
299
|
+
)
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
**Avro** (requires the `[avro]` extra):
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
FilesystemToDatabaseOperator(
|
|
306
|
+
task_id='load_avro_events',
|
|
307
|
+
filesystem_conn_id='my_data_lake',
|
|
308
|
+
database_conn_id='my_postgres',
|
|
309
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
310
|
+
db_table='events',
|
|
311
|
+
source_format='avro',
|
|
312
|
+
)
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
**Fixed-width** (no extra required — pandas native):
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
FilesystemToDatabaseOperator(
|
|
319
|
+
task_id='load_fixed_width',
|
|
320
|
+
filesystem_conn_id='my_data_lake',
|
|
321
|
+
database_conn_id='my_postgres',
|
|
322
|
+
filesystem_path='raw/exports/{{ ds }}/',
|
|
323
|
+
db_table='transactions',
|
|
324
|
+
source_format='fixed_width',
|
|
325
|
+
source_format_options={
|
|
326
|
+
'colspecs': [(0, 10), (10, 25), (25, 35)],
|
|
327
|
+
'names': ['date', 'description', 'amount'],
|
|
328
|
+
},
|
|
329
|
+
)
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
|
|
333
|
+
|
|
285
334
|
### DuckdbToDeltalake
|
|
286
335
|
|
|
287
336
|
Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
|
|
@@ -477,6 +526,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
|
|
|
477
526
|
|
|
478
527
|
---
|
|
479
528
|
|
|
529
|
+
## Testing Utilities
|
|
530
|
+
|
|
531
|
+
### MockFilesystem
|
|
532
|
+
|
|
533
|
+
`MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
|
|
534
|
+
|
|
535
|
+
```python
|
|
536
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
537
|
+
|
|
538
|
+
# Pre-load files at construction time
|
|
539
|
+
fs = MockFilesystem({
|
|
540
|
+
"raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
|
|
541
|
+
})
|
|
542
|
+
|
|
543
|
+
# Or write files programmatically
|
|
544
|
+
fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
|
|
545
|
+
|
|
546
|
+
# Inspect the result in assertions
|
|
547
|
+
assert fs.check_file("raw/orders/2024-01-01/data.csv")
|
|
548
|
+
assert len(fs.list_files("raw/orders/")) == 2
|
|
549
|
+
assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
|
|
550
|
+
```
|
|
551
|
+
|
|
552
|
+
Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
|
|
553
|
+
|
|
554
|
+
```python
|
|
555
|
+
from unittest.mock import patch
|
|
556
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
557
|
+
|
|
558
|
+
def test_my_pipeline(tmp_path):
|
|
559
|
+
fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
|
|
560
|
+
|
|
561
|
+
with patch(
|
|
562
|
+
"airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
|
|
563
|
+
return_value=fs,
|
|
564
|
+
):
|
|
565
|
+
# run your operator or task here
|
|
566
|
+
...
|
|
567
|
+
```
|
|
568
|
+
|
|
569
|
+
`MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
|
|
570
|
+
|
|
571
|
+
---
|
|
572
|
+
|
|
480
573
|
## Running Tests
|
|
481
574
|
|
|
482
575
|
### Integration tests
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "airflow-toolkit"
|
|
3
|
-
version = "2.2.0"
|
|
3
|
+
version = "2.3.0"
|
|
4
4
|
description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
|
|
5
5
|
authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
|
|
6
6
|
requires-python = ">=3.11,<3.15"
|
|
@@ -49,6 +49,12 @@ duckdb = [
|
|
|
49
49
|
sqlite = [
|
|
50
50
|
"apache-airflow-providers-sqlite",
|
|
51
51
|
]
|
|
52
|
+
excel = [
|
|
53
|
+
"openpyxl>=3.1",
|
|
54
|
+
]
|
|
55
|
+
avro = [
|
|
56
|
+
"fastavro>=1.9",
|
|
57
|
+
]
|
|
52
58
|
airflow3-full = [
|
|
53
59
|
"apache-airflow>=3,<4",
|
|
54
60
|
"apache-airflow-providers-fab>=3.0.0",
|
|
@@ -66,6 +72,8 @@ airflow3-full = [
|
|
|
66
72
|
"jmespath>=1.0.1,<2",
|
|
67
73
|
"airflow-provider-duckdb>=0.1.2",
|
|
68
74
|
"apache-airflow-providers-sqlite",
|
|
75
|
+
"openpyxl>=3.1",
|
|
76
|
+
"fastavro>=1.9",
|
|
69
77
|
]
|
|
70
78
|
|
|
71
79
|
[dependency-groups]
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import gzip
|
|
2
2
|
import zipfile
|
|
3
3
|
from io import BytesIO
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
from airflow_toolkit.types import CompressionOptions
|
|
5
6
|
|
|
6
7
|
DEFAULT_ZIP_FILENAME = "file.zip"
|
|
7
|
-
CompressionOptions = Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], None]
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def gzip_data(data: bytes) -> bytes:
|
|
@@ -15,6 +15,8 @@ import urllib.parse
|
|
|
15
15
|
from collections.abc import Iterator, Mapping
|
|
16
16
|
from typing import Any, Literal
|
|
17
17
|
|
|
18
|
+
from airflow_toolkit.types import MetadataSpec
|
|
19
|
+
|
|
18
20
|
import pandas as pd
|
|
19
21
|
from sqlalchemy import (
|
|
20
22
|
Boolean,
|
|
@@ -40,6 +42,29 @@ logger = logging.getLogger(__name__)
|
|
|
40
42
|
|
|
41
43
|
_BATCH_SIZE = 50_000
|
|
42
44
|
|
|
45
|
+
# Maps source_format names to the file extensions they match.
|
|
46
|
+
# Used in execute() to skip blobs that don't belong to the selected format.
|
|
47
|
+
_FORMAT_EXTENSIONS: dict[str, tuple[str, ...]] = {
|
|
48
|
+
"csv": (".csv", ".csv.gz"),
|
|
49
|
+
"json": (".json", ".json.gz"),
|
|
50
|
+
"parquet": (".parquet", ".parquet.gz"),
|
|
51
|
+
"excel": (".xlsx", ".xls"),
|
|
52
|
+
"avro": (".avro",),
|
|
53
|
+
"fixed_width": (".fwf", ".txt", ".dat"),
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
# Canonical extension for the temp file created in execute().
|
|
57
|
+
# Must match what the underlying reader expects (e.g. pandas read_excel
|
|
58
|
+
# infers the engine from the file extension).
|
|
59
|
+
_FORMAT_TEMP_SUFFIX: dict[str, str] = {
|
|
60
|
+
"csv": ".csv",
|
|
61
|
+
"json": ".json",
|
|
62
|
+
"parquet": ".parquet",
|
|
63
|
+
"excel": ".xlsx",
|
|
64
|
+
"avro": ".avro",
|
|
65
|
+
"fixed_width": ".fwf",
|
|
66
|
+
}
|
|
67
|
+
|
|
43
68
|
type_mapping: dict[str, type[Any]] = {
|
|
44
69
|
"int64": Integer,
|
|
45
70
|
"int": Integer,
|
|
@@ -76,11 +101,13 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
76
101
|
filesystem_path: str,
|
|
77
102
|
db_table: str,
|
|
78
103
|
db_schema: str | None = None,
|
|
79
|
-
source_format: Literal[
|
|
104
|
+
source_format: Literal[
|
|
105
|
+
"csv", "json", "parquet", "excel", "avro", "fixed_width"
|
|
106
|
+
] = "csv",
|
|
80
107
|
source_format_options: Mapping[str, Any] | None = None,
|
|
81
108
|
batch_size: int = _BATCH_SIZE,
|
|
82
109
|
table_aggregation_type: Literal["append", "fail", "replace"] = "append",
|
|
83
|
-
metadata:
|
|
110
|
+
metadata: MetadataSpec | None = None,
|
|
84
111
|
metadata_columns_in_uppercase: bool = True,
|
|
85
112
|
include_source_path: bool = True,
|
|
86
113
|
normalize_unicode: bool = False,
|
|
@@ -157,11 +184,12 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
157
184
|
if self.idempotent:
|
|
158
185
|
self._delete_existing_run_data(engine)
|
|
159
186
|
|
|
187
|
+
valid_extensions = _FORMAT_EXTENSIONS.get(
|
|
188
|
+
self.source_format, (f".{self.source_format}",)
|
|
189
|
+
)
|
|
160
190
|
first_batch = True
|
|
161
191
|
for blob_path in filesystem.list_files(prefix=self.filesystem_path):
|
|
162
|
-
if not blob_path.endswith(
|
|
163
|
-
(f".{self.source_format}", f".{self.source_format}.gz")
|
|
164
|
-
):
|
|
192
|
+
if not blob_path.endswith(valid_extensions):
|
|
165
193
|
logger.warning(
|
|
166
194
|
f"Blob {blob_path} is not in the right format. Skipping..."
|
|
167
195
|
)
|
|
@@ -175,7 +203,10 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
175
203
|
f"Downloaded {file_mb:.1f} MB in {time.monotonic() - dl_start:.1f}s"
|
|
176
204
|
)
|
|
177
205
|
|
|
178
|
-
|
|
206
|
+
tmp_suffix = _FORMAT_TEMP_SUFFIX.get(
|
|
207
|
+
self.source_format, f".{self.source_format}"
|
|
208
|
+
)
|
|
209
|
+
tmp_fd, tmp_path = tempfile.mkstemp(suffix=tmp_suffix)
|
|
179
210
|
try:
|
|
180
211
|
with os.fdopen(tmp_fd, "wb") as tmp_file:
|
|
181
212
|
tmp_file.write(raw_bytes)
|
|
@@ -310,6 +341,19 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
310
341
|
if peek_opts.get("lines"):
|
|
311
342
|
return set(pd.read_json(path, nrows=1, **peek_opts).columns)
|
|
312
343
|
return set(pd.read_json(path, **peek_opts).columns)
|
|
344
|
+
case "excel":
|
|
345
|
+
peek_opts = {k: v for k, v in options.items() if k != "sheet_name"}
|
|
346
|
+
return set(pd.read_excel(path, nrows=0, **peek_opts).columns)
|
|
347
|
+
case "avro":
|
|
348
|
+
import fastavro
|
|
349
|
+
|
|
350
|
+
with open(path, "rb") as f:
|
|
351
|
+
reader = fastavro.reader(f)
|
|
352
|
+
schema = reader.writer_schema
|
|
353
|
+
return {field["name"] for field in schema["fields"]}
|
|
354
|
+
case "fixed_width":
|
|
355
|
+
peek_opts = {k: v for k, v in options.items() if k != "chunksize"}
|
|
356
|
+
return set(pd.read_fwf(path, nrows=0, **peek_opts).columns)
|
|
313
357
|
case _:
|
|
314
358
|
return set()
|
|
315
359
|
|
|
@@ -398,6 +442,25 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
398
442
|
yield from pd.read_json(path, chunksize=self.batch_size, **options)
|
|
399
443
|
else:
|
|
400
444
|
yield pd.read_json(path, **options)
|
|
445
|
+
case "excel":
|
|
446
|
+
df = pd.read_excel(path, **options)
|
|
447
|
+
for start in range(0, max(len(df), 1), self.batch_size):
|
|
448
|
+
yield df.iloc[start : start + self.batch_size].copy()
|
|
449
|
+
case "avro":
|
|
450
|
+
import fastavro
|
|
451
|
+
|
|
452
|
+
with open(path, "rb") as f:
|
|
453
|
+
reader = fastavro.reader(f)
|
|
454
|
+
batch: list[dict[str, Any]] = []
|
|
455
|
+
for record in reader:
|
|
456
|
+
batch.append(record)
|
|
457
|
+
if len(batch) >= self.batch_size:
|
|
458
|
+
yield pd.DataFrame(batch)
|
|
459
|
+
batch = []
|
|
460
|
+
if batch:
|
|
461
|
+
yield pd.DataFrame(batch)
|
|
462
|
+
case "fixed_width":
|
|
463
|
+
yield from pd.read_fwf(path, chunksize=self.batch_size, **options)
|
|
401
464
|
case _:
|
|
402
465
|
raise ValueError(f"Unknown source format {self.source_format}")
|
|
403
466
|
|
|
@@ -544,5 +607,26 @@ class FilesystemToDatabaseOperator(BaseOperator):
|
|
|
544
607
|
return pd.read_json(path_or_buf, **options)
|
|
545
608
|
case "parquet":
|
|
546
609
|
return pd.read_parquet(path_or_buf, **options)
|
|
610
|
+
case "excel":
|
|
611
|
+
return pd.read_excel(path_or_buf, **options)
|
|
612
|
+
case "avro":
|
|
613
|
+
import fastavro
|
|
614
|
+
|
|
615
|
+
if isinstance(path_or_buf, (str, bytes)):
|
|
616
|
+
buf = io.BytesIO(
|
|
617
|
+
path_or_buf
|
|
618
|
+
if isinstance(path_or_buf, bytes)
|
|
619
|
+
else path_or_buf.encode()
|
|
620
|
+
)
|
|
621
|
+
else:
|
|
622
|
+
buf = (
|
|
623
|
+
path_or_buf
|
|
624
|
+
if isinstance(path_or_buf, io.BytesIO)
|
|
625
|
+
else io.BytesIO(path_or_buf.read().encode())
|
|
626
|
+
)
|
|
627
|
+
records = list(fastavro.reader(buf))
|
|
628
|
+
return pd.DataFrame(records)
|
|
629
|
+
case "fixed_width":
|
|
630
|
+
return pd.read_fwf(path_or_buf, **options)
|
|
547
631
|
case _:
|
|
548
632
|
raise ValueError(f"Unknown source format {self.source_format}")
|
|
@@ -9,13 +9,9 @@ from typing import (
|
|
|
9
9
|
Any,
|
|
10
10
|
Callable,
|
|
11
11
|
Generator,
|
|
12
|
-
Literal,
|
|
13
12
|
Optional,
|
|
14
|
-
Type,
|
|
15
13
|
)
|
|
16
14
|
|
|
17
|
-
from typing import TypedDict
|
|
18
|
-
|
|
19
15
|
import jmespath
|
|
20
16
|
import pandas as pd
|
|
21
17
|
|
|
@@ -25,16 +21,20 @@ from airflow.utils.helpers import merge_dicts
|
|
|
25
21
|
from requests import Response
|
|
26
22
|
|
|
27
23
|
from airflow_toolkit._compact.airflow_shim import BaseOperator, Context, BaseHook
|
|
28
|
-
from airflow_toolkit.compression_utils import
|
|
24
|
+
from airflow_toolkit.compression_utils import compress
|
|
29
25
|
from airflow_toolkit.exceptions import ApiResponseTypeError
|
|
30
26
|
from airflow_toolkit.filesystems.filesystem_factory import FilesystemFactory
|
|
31
27
|
from airflow_toolkit.protocols import HttpTransformation
|
|
28
|
+
from airflow_toolkit.types import (
|
|
29
|
+
CompressionOptions,
|
|
30
|
+
RequestSpec,
|
|
31
|
+
RequestState,
|
|
32
|
+
SaveFormat,
|
|
33
|
+
)
|
|
32
34
|
|
|
33
35
|
if TYPE_CHECKING:
|
|
34
36
|
from requests.auth import AuthBase
|
|
35
37
|
|
|
36
|
-
SaveFormat = Literal["jsonl"]
|
|
37
|
-
|
|
38
38
|
|
|
39
39
|
class HttpBatchOperator(HttpOperator):
|
|
40
40
|
def execute(
|
|
@@ -318,34 +318,6 @@ class HttpToFilesystem(BaseOperator):
|
|
|
318
318
|
raise TypeError(f"Unsupported transformation output type: {type(value)!r}")
|
|
319
319
|
|
|
320
320
|
|
|
321
|
-
class RequestSpec(TypedDict, total=False):
|
|
322
|
-
"""User-provided per-request overrides (all keys optional)."""
|
|
323
|
-
|
|
324
|
-
endpoint: str
|
|
325
|
-
method: str
|
|
326
|
-
data: Any
|
|
327
|
-
headers: dict[str, str]
|
|
328
|
-
auth_type: Type["AuthBase"] | None
|
|
329
|
-
jmespath_expression: str | None
|
|
330
|
-
save_format: "SaveFormat"
|
|
331
|
-
source_format: "SaveFormat"
|
|
332
|
-
compression: "CompressionOptions" | None
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
class RequestState(TypedDict):
|
|
336
|
-
"""Fully-resolved runtime state (all keys present)."""
|
|
337
|
-
|
|
338
|
-
endpoint: str | None
|
|
339
|
-
method: str
|
|
340
|
-
data: Any
|
|
341
|
-
headers: dict[str, str] | None
|
|
342
|
-
auth_type: Type["AuthBase"] | None
|
|
343
|
-
jmespath_expression: str | None
|
|
344
|
-
save_format: "SaveFormat"
|
|
345
|
-
source_format: "SaveFormat"
|
|
346
|
-
compression: "CompressionOptions" | None
|
|
347
|
-
|
|
348
|
-
|
|
349
321
|
class MultiHttpToFilesystem(HttpToFilesystem):
|
|
350
322
|
"""
|
|
351
323
|
Execute multiple HTTP requests in a single task and save each response as a separate file.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Testing utilities for airflow-toolkit.
|
|
2
|
+
|
|
3
|
+
Import from here in your unit tests — no Docker, no cloud credentials needed.
|
|
4
|
+
|
|
5
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
6
|
+
|
|
7
|
+
fs = MockFilesystem({"data/2024-01-01.csv": b"id,name\\n1,Alice"})
|
|
8
|
+
fs.write(b"id,name\\n2,Bob", "data/2024-01-02.csv")
|
|
9
|
+
assert fs.check_file("data/2024-01-01.csv")
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from io import BytesIO
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MockFilesystem:
|
|
18
|
+
"""In-memory implementation of FilesystemProtocol for unit testing.
|
|
19
|
+
|
|
20
|
+
Stores all files in a plain dict — no network, no Docker, no credentials.
|
|
21
|
+
Inspect ``fs.files`` directly in assertions.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
files: Optional seed data mapping path → bytes.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self, files: dict[str, bytes] | None = None) -> None:
|
|
28
|
+
self.files: dict[str, bytes] = dict(files or {})
|
|
29
|
+
|
|
30
|
+
def read(self, path: str) -> bytes:
|
|
31
|
+
if path not in self.files:
|
|
32
|
+
raise FileNotFoundError(f"MockFilesystem: no file at '{path}'")
|
|
33
|
+
return self.files[path]
|
|
34
|
+
|
|
35
|
+
def write(self, data: str | bytes | BytesIO, path: str) -> None:
|
|
36
|
+
if isinstance(data, str):
|
|
37
|
+
data = data.encode()
|
|
38
|
+
elif isinstance(data, BytesIO):
|
|
39
|
+
data = data.getvalue()
|
|
40
|
+
self.files[path] = data
|
|
41
|
+
|
|
42
|
+
def delete_file(self, path: str) -> None:
|
|
43
|
+
self.files.pop(path, None)
|
|
44
|
+
|
|
45
|
+
def create_prefix(self, prefix: str) -> None:
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
def delete_prefix(self, prefix: str) -> None:
|
|
49
|
+
for key in [k for k in self.files if k.startswith(prefix)]:
|
|
50
|
+
del self.files[key]
|
|
51
|
+
|
|
52
|
+
def check_file(self, path: str) -> bool:
|
|
53
|
+
return path in self.files
|
|
54
|
+
|
|
55
|
+
def check_prefix(self, prefix: str) -> bool:
|
|
56
|
+
return any(k.startswith(prefix) for k in self.files)
|
|
57
|
+
|
|
58
|
+
def list_files(self, prefix: str) -> list[str]:
|
|
59
|
+
return [k for k in self.files if k.startswith(prefix)]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Literal, Type, TypedDict
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from requests.auth import AuthBase
|
|
7
|
+
|
|
8
|
+
# ── Compression ────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
CompressionOptions = Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"] | None
|
|
11
|
+
|
|
12
|
+
# ── Filesystem / format ────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
SaveFormat = Literal["jsonl"]
|
|
15
|
+
|
|
16
|
+
# ── Metadata columns ───────────────────────────────────────────────────────
|
|
17
|
+
# Passed to FilesystemToDatabaseOperator as extra columns added to every row.
|
|
18
|
+
# Key = column name; value = Airflow template string (e.g. "{{ ds }}").
|
|
19
|
+
# Keys prefixed with "_" are coerced to datetime at load time.
|
|
20
|
+
|
|
21
|
+
MetadataSpec = dict[str, str]
|
|
22
|
+
|
|
23
|
+
# ── HTTP multi-request ─────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RequestSpec(TypedDict, total=False):
|
|
27
|
+
"""User-provided per-request overrides for MultiHttpToFilesystem (all keys optional)."""
|
|
28
|
+
|
|
29
|
+
endpoint: str
|
|
30
|
+
method: str
|
|
31
|
+
data: Any
|
|
32
|
+
headers: dict[str, str]
|
|
33
|
+
auth_type: Type["AuthBase"] | None
|
|
34
|
+
jmespath_expression: str | None
|
|
35
|
+
save_format: SaveFormat
|
|
36
|
+
source_format: SaveFormat
|
|
37
|
+
compression: CompressionOptions
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class RequestState(TypedDict):
|
|
41
|
+
"""Fully-resolved runtime state for a single HTTP request (all keys required)."""
|
|
42
|
+
|
|
43
|
+
endpoint: str | None
|
|
44
|
+
method: str
|
|
45
|
+
data: Any
|
|
46
|
+
headers: dict[str, str] | None
|
|
47
|
+
auth_type: Type["AuthBase"] | None
|
|
48
|
+
jmespath_expression: str | None
|
|
49
|
+
save_format: SaveFormat
|
|
50
|
+
source_format: SaveFormat
|
|
51
|
+
compression: CompressionOptions
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: airflow-toolkit
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
|
|
5
5
|
Author-email: Biel Llobera <biel_llobera@dkl.digital>
|
|
6
6
|
Requires-Python: <3.15,>=3.11
|
|
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
|
|
|
32
32
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
|
|
33
33
|
Provides-Extra: sqlite
|
|
34
34
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
|
|
35
|
+
Provides-Extra: excel
|
|
36
|
+
Requires-Dist: openpyxl>=3.1; extra == "excel"
|
|
37
|
+
Provides-Extra: avro
|
|
38
|
+
Requires-Dist: fastavro>=1.9; extra == "avro"
|
|
35
39
|
Provides-Extra: airflow3-full
|
|
36
40
|
Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
|
|
37
41
|
Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
|
|
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
|
|
|
49
53
|
Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
|
|
50
54
|
Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
|
|
51
55
|
Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
|
|
56
|
+
Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
|
|
57
|
+
Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
|
|
52
58
|
Dynamic: license-file
|
|
53
59
|
|
|
54
60
|
# Airflow Toolkit
|
|
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
|
|
|
136
142
|
| `google` | `providers-google` | GCS filesystem backend |
|
|
137
143
|
| `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
|
|
138
144
|
| `sftp` | `providers-sftp` | SFTP filesystem backend |
|
|
139
|
-
| `slack` | `providers-slack` | Slack failure notifications |
|
|
140
145
|
| `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
|
|
141
146
|
| `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
|
|
142
147
|
| `sqlite` | `providers-sqlite` | SQLite as source or destination |
|
|
148
|
+
| `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
|
|
149
|
+
| `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
|
|
143
150
|
| `airflow3-full` | all of the above | Quick start / development |
|
|
144
151
|
|
|
145
152
|
---
|
|
@@ -313,7 +320,9 @@ FilesystemToFilesystem(
|
|
|
313
320
|
|
|
314
321
|
### FilesystemToDatabase
|
|
315
322
|
|
|
316
|
-
Reads files
|
|
323
|
+
Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
|
|
324
|
+
|
|
325
|
+
**Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
|
|
317
326
|
|
|
318
327
|
```python
|
|
319
328
|
from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
|
|
@@ -325,7 +334,7 @@ FilesystemToDatabaseOperator(
|
|
|
325
334
|
filesystem_path='raw/orders/{{ ds }}/',
|
|
326
335
|
db_schema='public',
|
|
327
336
|
db_table='orders',
|
|
328
|
-
source_format='csv',
|
|
337
|
+
source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
|
|
329
338
|
table_aggregation_type='append', # 'append' | 'replace' | 'fail'
|
|
330
339
|
metadata={
|
|
331
340
|
'_ds': '{{ ds }}',
|
|
@@ -335,6 +344,52 @@ FilesystemToDatabaseOperator(
|
|
|
335
344
|
)
|
|
336
345
|
```
|
|
337
346
|
|
|
347
|
+
**Excel** (requires the `[excel]` extra):
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
FilesystemToDatabaseOperator(
|
|
351
|
+
task_id='load_excel_report',
|
|
352
|
+
filesystem_conn_id='my_data_lake',
|
|
353
|
+
database_conn_id='my_postgres',
|
|
354
|
+
filesystem_path='raw/reports/{{ ds }}/',
|
|
355
|
+
db_table='monthly_report',
|
|
356
|
+
source_format='excel',
|
|
357
|
+
source_format_options={'sheet_name': 'Data'},
|
|
358
|
+
)
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
**Avro** (requires the `[avro]` extra):
|
|
362
|
+
|
|
363
|
+
```python
|
|
364
|
+
FilesystemToDatabaseOperator(
|
|
365
|
+
task_id='load_avro_events',
|
|
366
|
+
filesystem_conn_id='my_data_lake',
|
|
367
|
+
database_conn_id='my_postgres',
|
|
368
|
+
filesystem_path='raw/events/{{ ds }}/',
|
|
369
|
+
db_table='events',
|
|
370
|
+
source_format='avro',
|
|
371
|
+
)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Fixed-width** (no extra required — pandas native):
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
FilesystemToDatabaseOperator(
|
|
378
|
+
task_id='load_fixed_width',
|
|
379
|
+
filesystem_conn_id='my_data_lake',
|
|
380
|
+
database_conn_id='my_postgres',
|
|
381
|
+
filesystem_path='raw/exports/{{ ds }}/',
|
|
382
|
+
db_table='transactions',
|
|
383
|
+
source_format='fixed_width',
|
|
384
|
+
source_format_options={
|
|
385
|
+
'colspecs': [(0, 10), (10, 25), (25, 35)],
|
|
386
|
+
'names': ['date', 'description', 'amount'],
|
|
387
|
+
},
|
|
388
|
+
)
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
|
|
392
|
+
|
|
338
393
|
### DuckdbToDeltalake
|
|
339
394
|
|
|
340
395
|
Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
|
|
@@ -530,6 +585,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
|
|
|
530
585
|
|
|
531
586
|
---
|
|
532
587
|
|
|
588
|
+
## Testing Utilities
|
|
589
|
+
|
|
590
|
+
### MockFilesystem
|
|
591
|
+
|
|
592
|
+
`MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
|
|
593
|
+
|
|
594
|
+
```python
|
|
595
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
596
|
+
|
|
597
|
+
# Pre-load files at construction time
|
|
598
|
+
fs = MockFilesystem({
|
|
599
|
+
"raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
|
|
600
|
+
})
|
|
601
|
+
|
|
602
|
+
# Or write files programmatically
|
|
603
|
+
fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
|
|
604
|
+
|
|
605
|
+
# Inspect the result in assertions
|
|
606
|
+
assert fs.check_file("raw/orders/2024-01-01/data.csv")
|
|
607
|
+
assert len(fs.list_files("raw/orders/")) == 2
|
|
608
|
+
assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
|
|
609
|
+
```
|
|
610
|
+
|
|
611
|
+
Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
|
|
612
|
+
|
|
613
|
+
```python
|
|
614
|
+
from unittest.mock import patch
|
|
615
|
+
from airflow_toolkit.testing import MockFilesystem
|
|
616
|
+
|
|
617
|
+
def test_my_pipeline(tmp_path):
|
|
618
|
+
fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
|
|
619
|
+
|
|
620
|
+
with patch(
|
|
621
|
+
"airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
|
|
622
|
+
return_value=fs,
|
|
623
|
+
):
|
|
624
|
+
# run your operator or task here
|
|
625
|
+
...
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
`MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
|
|
629
|
+
|
|
630
|
+
---
|
|
631
|
+
|
|
533
632
|
## Running Tests
|
|
534
633
|
|
|
535
634
|
### Integration tests
|
|
@@ -6,6 +6,8 @@ src/airflow_toolkit/compression_utils.py
|
|
|
6
6
|
src/airflow_toolkit/exceptions.py
|
|
7
7
|
src/airflow_toolkit/protocols.py
|
|
8
8
|
src/airflow_toolkit/py.typed
|
|
9
|
+
src/airflow_toolkit/testing.py
|
|
10
|
+
src/airflow_toolkit/types.py
|
|
9
11
|
src/airflow_toolkit.egg-info/PKG-INFO
|
|
10
12
|
src/airflow_toolkit.egg-info/SOURCES.txt
|
|
11
13
|
src/airflow_toolkit.egg-info/dependency_links.txt
|
|
@@ -21,10 +21,15 @@ requests>=2.31.0
|
|
|
21
21
|
jmespath<2,>=1.0.1
|
|
22
22
|
airflow-provider-duckdb>=0.1.2
|
|
23
23
|
apache-airflow-providers-sqlite
|
|
24
|
+
openpyxl>=3.1
|
|
25
|
+
fastavro>=1.9
|
|
24
26
|
|
|
25
27
|
[amazon]
|
|
26
28
|
apache-airflow-providers-amazon>=9.15.0
|
|
27
29
|
|
|
30
|
+
[avro]
|
|
31
|
+
fastavro>=1.9
|
|
32
|
+
|
|
28
33
|
[azure]
|
|
29
34
|
apache-airflow-providers-microsoft-azure>=8
|
|
30
35
|
|
|
@@ -37,6 +42,9 @@ pandas<3,>=2.1.1
|
|
|
37
42
|
[duckdb]
|
|
38
43
|
airflow-provider-duckdb>=0.1.2
|
|
39
44
|
|
|
45
|
+
[excel]
|
|
46
|
+
openpyxl>=3.1
|
|
47
|
+
|
|
40
48
|
[google]
|
|
41
49
|
apache-airflow-providers-google>=18
|
|
42
50
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/_compact/airflow_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/email.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/slack.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/teams.py
RENAMED
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/context.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/tasks.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|