airflow-toolkit 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {airflow_toolkit-2.2.0/src/airflow_toolkit.egg-info → airflow_toolkit-2.3.0}/PKG-INFO +103 -4
  2. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/README.md +96 -3
  3. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/pyproject.toml +9 -1
  4. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/compression_utils.py +2 -2
  5. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/filesystem_to_database.py +90 -6
  6. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/http_to_filesystem.py +7 -35
  7. airflow_toolkit-2.3.0/src/airflow_toolkit/testing.py +59 -0
  8. airflow_toolkit-2.3.0/src/airflow_toolkit/types.py +51 -0
  9. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0/src/airflow_toolkit.egg-info}/PKG-INFO +103 -4
  10. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/SOURCES.txt +2 -0
  11. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/requires.txt +8 -0
  12. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/LICENSE.txt +0 -0
  13. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/setup.cfg +0 -0
  14. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/__init__.py +0 -0
  15. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/_compact/airflow_shim.py +0 -0
  16. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/exceptions.py +0 -0
  17. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/__init__.py +0 -0
  18. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/filesystem_factory.py +0 -0
  19. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/filesystem_protocol.py +0 -0
  20. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/__init__.py +0 -0
  21. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/azure_databricks_volume_filesystem.py +0 -0
  22. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/azure_file_share_filesystem.py +0 -0
  23. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/blob_storage_filesystem.py +0 -0
  24. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/google_cloud_storage_filesystem.py +0 -0
  25. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/local_filesystem.py +0 -0
  26. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/s3_filesystem.py +0 -0
  27. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/filesystems/impl/sftp_filesystem.py +0 -0
  28. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/__init__.py +0 -0
  29. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/__init__.py +0 -0
  30. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/discord.py +0 -0
  31. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/email.py +0 -0
  32. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/slack.py +0 -0
  33. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/channels/teams.py +0 -0
  34. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/notifications/context.py +0 -0
  35. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/protocols.py +0 -0
  36. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/__init__.py +0 -0
  37. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/__init__.py +0 -0
  38. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/__init__.py +0 -0
  39. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/azure_databricks.py +0 -0
  40. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/azure/hooks/azure_file_share.py +0 -0
  41. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/__init__.py +0 -0
  42. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/__init__.py +0 -0
  43. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/operators/duckdb_to_deltalake.py +0 -0
  44. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/sensors/__init__.py +0 -0
  45. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/deltalake/sensors/filesystem_file.py +0 -0
  46. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/__init__.py +0 -0
  47. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/__init__.py +0 -0
  48. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/operators/filesystem.py +0 -0
  49. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/filesystem/tasks.py +0 -0
  50. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/providers/package.py +0 -0
  51. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit/py.typed +0 -0
  52. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/dependency_links.txt +0 -0
  53. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/entry_points.txt +0 -0
  54. {airflow_toolkit-2.2.0 → airflow_toolkit-2.3.0}/src/airflow_toolkit.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: airflow-toolkit
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
5
5
  Author-email: Biel Llobera <biel_llobera@dkl.digital>
6
6
  Requires-Python: <3.15,>=3.11
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
32
32
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
33
33
  Provides-Extra: sqlite
34
34
  Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
35
+ Provides-Extra: excel
36
+ Requires-Dist: openpyxl>=3.1; extra == "excel"
37
+ Provides-Extra: avro
38
+ Requires-Dist: fastavro>=1.9; extra == "avro"
35
39
  Provides-Extra: airflow3-full
36
40
  Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
37
41
  Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
49
53
  Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
50
54
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
51
55
  Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
56
+ Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
57
+ Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
52
58
  Dynamic: license-file
53
59
 
54
60
  # Airflow Toolkit
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
136
142
  | `google` | `providers-google` | GCS filesystem backend |
137
143
  | `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
138
144
  | `sftp` | `providers-sftp` | SFTP filesystem backend |
139
- | `slack` | `providers-slack` | Slack failure notifications |
140
145
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
141
146
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
142
147
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
148
+ | `excel` | `openpyxl` | Excel (`.xlsx`) support in `FilesystemToDatabase`; legacy `.xls` files additionally require `xlrd` |
149
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
143
150
  | `airflow3-full` | all of the above | Quick start / development |
144
151
 
145
152
  ---
@@ -313,7 +320,9 @@ FilesystemToFilesystem(
313
320
 
314
321
  ### FilesystemToDatabase
315
322
 
316
- Reads files (CSV, JSON, or Parquet) from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
323
+ Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
324
+
325
+ **Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
317
326
 
318
327
  ```python
319
328
  from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
@@ -325,7 +334,7 @@ FilesystemToDatabaseOperator(
325
334
  filesystem_path='raw/orders/{{ ds }}/',
326
335
  db_schema='public',
327
336
  db_table='orders',
328
- source_format='csv',
337
+ source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
329
338
  table_aggregation_type='append', # 'append' | 'replace' | 'fail'
330
339
  metadata={
331
340
  '_ds': '{{ ds }}',
@@ -335,6 +344,52 @@ FilesystemToDatabaseOperator(
335
344
  )
336
345
  ```
337
346
 
347
+ **Excel** (requires the `[excel]` extra):
348
+
349
+ ```python
350
+ FilesystemToDatabaseOperator(
351
+ task_id='load_excel_report',
352
+ filesystem_conn_id='my_data_lake',
353
+ database_conn_id='my_postgres',
354
+ filesystem_path='raw/reports/{{ ds }}/',
355
+ db_table='monthly_report',
356
+ source_format='excel',
357
+ source_format_options={'sheet_name': 'Data'},
358
+ )
359
+ ```
360
+
361
+ **Avro** (requires the `[avro]` extra):
362
+
363
+ ```python
364
+ FilesystemToDatabaseOperator(
365
+ task_id='load_avro_events',
366
+ filesystem_conn_id='my_data_lake',
367
+ database_conn_id='my_postgres',
368
+ filesystem_path='raw/events/{{ ds }}/',
369
+ db_table='events',
370
+ source_format='avro',
371
+ )
372
+ ```
373
+
374
+ **Fixed-width** (no extra required — pandas native):
375
+
376
+ ```python
377
+ FilesystemToDatabaseOperator(
378
+ task_id='load_fixed_width',
379
+ filesystem_conn_id='my_data_lake',
380
+ database_conn_id='my_postgres',
381
+ filesystem_path='raw/exports/{{ ds }}/',
382
+ db_table='transactions',
383
+ source_format='fixed_width',
384
+ source_format_options={
385
+ 'colspecs': [(0, 10), (10, 25), (25, 35)],
386
+ 'names': ['date', 'description', 'amount'],
387
+ },
388
+ )
389
+ ```
390
+
391
+ Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are skipped, and a warning is logged for each skipped file.
392
+
338
393
  ### DuckdbToDeltalake
339
394
 
340
395
  Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
@@ -530,6 +585,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
530
585
 
531
586
  ---
532
587
 
588
+ ## Testing Utilities
589
+
590
+ ### MockFilesystem
591
+
592
+ `MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
593
+
594
+ ```python
595
+ from airflow_toolkit.testing import MockFilesystem
596
+
597
+ # Pre-load files at construction time
598
+ fs = MockFilesystem({
599
+ "raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
600
+ })
601
+
602
+ # Or write files programmatically
603
+ fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
604
+
605
+ # Inspect the result in assertions
606
+ assert fs.check_file("raw/orders/2024-01-01/data.csv")
607
+ assert len(fs.list_files("raw/orders/")) == 2
608
+ assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
609
+ ```
610
+
611
+ Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
612
+
613
+ ```python
614
+ from unittest.mock import patch
615
+ from airflow_toolkit.testing import MockFilesystem
616
+
617
+ def test_my_pipeline(tmp_path):
618
+ fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
619
+
620
+ with patch(
621
+ "airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
622
+ return_value=fs,
623
+ ):
624
+ # run your operator or task here
625
+ ...
626
+ ```
627
+
628
+ `MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
629
+
630
+ ---
631
+
533
632
  ## Running Tests
534
633
 
535
634
  ### Integration tests
@@ -83,10 +83,11 @@ pip install "airflow-toolkit[airflow3-full]"
83
83
  | `google` | `providers-google` | GCS filesystem backend |
84
84
  | `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
85
85
  | `sftp` | `providers-sftp` | SFTP filesystem backend |
86
- | `slack` | `providers-slack` | Slack failure notifications |
87
86
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
88
87
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
89
88
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
89
+ | `excel` | `openpyxl` | Excel (`.xlsx`) support in `FilesystemToDatabase`; legacy `.xls` files additionally require `xlrd` |
90
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
90
91
  | `airflow3-full` | all of the above | Quick start / development |
91
92
 
92
93
  ---
@@ -260,7 +261,9 @@ FilesystemToFilesystem(
260
261
 
261
262
  ### FilesystemToDatabase
262
263
 
263
- Reads files (CSV, JSON, or Parquet) from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
264
+ Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
265
+
266
+ **Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
264
267
 
265
268
  ```python
266
269
  from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
@@ -272,7 +275,7 @@ FilesystemToDatabaseOperator(
272
275
  filesystem_path='raw/orders/{{ ds }}/',
273
276
  db_schema='public',
274
277
  db_table='orders',
275
- source_format='csv',
278
+ source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
276
279
  table_aggregation_type='append', # 'append' | 'replace' | 'fail'
277
280
  metadata={
278
281
  '_ds': '{{ ds }}',
@@ -282,6 +285,52 @@ FilesystemToDatabaseOperator(
282
285
  )
283
286
  ```
284
287
 
288
+ **Excel** (requires the `[excel]` extra):
289
+
290
+ ```python
291
+ FilesystemToDatabaseOperator(
292
+ task_id='load_excel_report',
293
+ filesystem_conn_id='my_data_lake',
294
+ database_conn_id='my_postgres',
295
+ filesystem_path='raw/reports/{{ ds }}/',
296
+ db_table='monthly_report',
297
+ source_format='excel',
298
+ source_format_options={'sheet_name': 'Data'},
299
+ )
300
+ ```
301
+
302
+ **Avro** (requires the `[avro]` extra):
303
+
304
+ ```python
305
+ FilesystemToDatabaseOperator(
306
+ task_id='load_avro_events',
307
+ filesystem_conn_id='my_data_lake',
308
+ database_conn_id='my_postgres',
309
+ filesystem_path='raw/events/{{ ds }}/',
310
+ db_table='events',
311
+ source_format='avro',
312
+ )
313
+ ```
314
+
315
+ **Fixed-width** (no extra required — pandas native):
316
+
317
+ ```python
318
+ FilesystemToDatabaseOperator(
319
+ task_id='load_fixed_width',
320
+ filesystem_conn_id='my_data_lake',
321
+ database_conn_id='my_postgres',
322
+ filesystem_path='raw/exports/{{ ds }}/',
323
+ db_table='transactions',
324
+ source_format='fixed_width',
325
+ source_format_options={
326
+ 'colspecs': [(0, 10), (10, 25), (25, 35)],
327
+ 'names': ['date', 'description', 'amount'],
328
+ },
329
+ )
330
+ ```
331
+
332
+ Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are skipped, and a warning is logged for each skipped file.
333
+
285
334
  ### DuckdbToDeltalake
286
335
 
287
336
  Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
@@ -477,6 +526,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
477
526
 
478
527
  ---
479
528
 
529
+ ## Testing Utilities
530
+
531
+ ### MockFilesystem
532
+
533
+ `MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
534
+
535
+ ```python
536
+ from airflow_toolkit.testing import MockFilesystem
537
+
538
+ # Pre-load files at construction time
539
+ fs = MockFilesystem({
540
+ "raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
541
+ })
542
+
543
+ # Or write files programmatically
544
+ fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
545
+
546
+ # Inspect the result in assertions
547
+ assert fs.check_file("raw/orders/2024-01-01/data.csv")
548
+ assert len(fs.list_files("raw/orders/")) == 2
549
+ assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
550
+ ```
551
+
552
+ Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
553
+
554
+ ```python
555
+ from unittest.mock import patch
556
+ from airflow_toolkit.testing import MockFilesystem
557
+
558
+ def test_my_pipeline(tmp_path):
559
+ fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
560
+
561
+ with patch(
562
+ "airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
563
+ return_value=fs,
564
+ ):
565
+ # run your operator or task here
566
+ ...
567
+ ```
568
+
569
+ `MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
570
+
571
+ ---
572
+
480
573
  ## Running Tests
481
574
 
482
575
  ### Integration tests
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "airflow-toolkit"
3
- version = "2.2.0"
3
+ version = "2.3.0"
4
4
  description = "A toolkit of operators, hooks and utilities for Apache Airflow 3"
5
5
  authors = [{ name = "Biel Llobera", email = "biel_llobera@dkl.digital" }]
6
6
  requires-python = ">=3.11,<3.15"
@@ -49,6 +49,12 @@ duckdb = [
49
49
  sqlite = [
50
50
  "apache-airflow-providers-sqlite",
51
51
  ]
52
+ excel = [
53
+ "openpyxl>=3.1",
54
+ ]
55
+ avro = [
56
+ "fastavro>=1.9",
57
+ ]
52
58
  airflow3-full = [
53
59
  "apache-airflow>=3,<4",
54
60
  "apache-airflow-providers-fab>=3.0.0",
@@ -66,6 +72,8 @@ airflow3-full = [
66
72
  "jmespath>=1.0.1,<2",
67
73
  "airflow-provider-duckdb>=0.1.2",
68
74
  "apache-airflow-providers-sqlite",
75
+ "openpyxl>=3.1",
76
+ "fastavro>=1.9",
69
77
  ]
70
78
 
71
79
  [dependency-groups]
@@ -1,10 +1,10 @@
1
1
  import gzip
2
2
  import zipfile
3
3
  from io import BytesIO
4
- from typing import Literal, Union
4
+
5
+ from airflow_toolkit.types import CompressionOptions
5
6
 
6
7
  DEFAULT_ZIP_FILENAME = "file.zip"
7
- CompressionOptions = Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], None]
8
8
 
9
9
 
10
10
  def gzip_data(data: bytes) -> bytes:
@@ -15,6 +15,8 @@ import urllib.parse
15
15
  from collections.abc import Iterator, Mapping
16
16
  from typing import Any, Literal
17
17
 
18
+ from airflow_toolkit.types import MetadataSpec
19
+
18
20
  import pandas as pd
19
21
  from sqlalchemy import (
20
22
  Boolean,
@@ -40,6 +42,29 @@ logger = logging.getLogger(__name__)
40
42
 
41
43
  _BATCH_SIZE = 50_000
42
44
 
45
+ # Maps source_format names to the file extensions they match.
46
+ # Used in execute() to skip blobs that don't belong to the selected format.
47
+ _FORMAT_EXTENSIONS: dict[str, tuple[str, ...]] = {
48
+ "csv": (".csv", ".csv.gz"),
49
+ "json": (".json", ".json.gz"),
50
+ "parquet": (".parquet", ".parquet.gz"),
51
+ "excel": (".xlsx", ".xls"),
52
+ "avro": (".avro",),
53
+ "fixed_width": (".fwf", ".txt", ".dat"),
54
+ }
55
+
56
+ # Canonical extension for the temp file created in execute().
57
+ # Must match what the underlying reader expects (e.g. pandas read_excel
58
+ # infers the engine from the file extension).
59
+ _FORMAT_TEMP_SUFFIX: dict[str, str] = {
60
+ "csv": ".csv",
61
+ "json": ".json",
62
+ "parquet": ".parquet",
63
+ "excel": ".xlsx",
64
+ "avro": ".avro",
65
+ "fixed_width": ".fwf",
66
+ }
67
+
43
68
  type_mapping: dict[str, type[Any]] = {
44
69
  "int64": Integer,
45
70
  "int": Integer,
@@ -76,11 +101,13 @@ class FilesystemToDatabaseOperator(BaseOperator):
76
101
  filesystem_path: str,
77
102
  db_table: str,
78
103
  db_schema: str | None = None,
79
- source_format: Literal["csv", "json", "parquet"] = "csv",
104
+ source_format: Literal[
105
+ "csv", "json", "parquet", "excel", "avro", "fixed_width"
106
+ ] = "csv",
80
107
  source_format_options: Mapping[str, Any] | None = None,
81
108
  batch_size: int = _BATCH_SIZE,
82
109
  table_aggregation_type: Literal["append", "fail", "replace"] = "append",
83
- metadata: Mapping[str, str] | None = None,
110
+ metadata: MetadataSpec | None = None,
84
111
  metadata_columns_in_uppercase: bool = True,
85
112
  include_source_path: bool = True,
86
113
  normalize_unicode: bool = False,
@@ -157,11 +184,12 @@ class FilesystemToDatabaseOperator(BaseOperator):
157
184
  if self.idempotent:
158
185
  self._delete_existing_run_data(engine)
159
186
 
187
+ valid_extensions = _FORMAT_EXTENSIONS.get(
188
+ self.source_format, (f".{self.source_format}",)
189
+ )
160
190
  first_batch = True
161
191
  for blob_path in filesystem.list_files(prefix=self.filesystem_path):
162
- if not blob_path.endswith(
163
- (f".{self.source_format}", f".{self.source_format}.gz")
164
- ):
192
+ if not blob_path.endswith(valid_extensions):
165
193
  logger.warning(
166
194
  f"Blob {blob_path} is not in the right format. Skipping..."
167
195
  )
@@ -175,7 +203,10 @@ class FilesystemToDatabaseOperator(BaseOperator):
175
203
  f"Downloaded {file_mb:.1f} MB in {time.monotonic() - dl_start:.1f}s"
176
204
  )
177
205
 
178
- tmp_fd, tmp_path = tempfile.mkstemp(suffix=f".{self.source_format}")
206
+ tmp_suffix = _FORMAT_TEMP_SUFFIX.get(
207
+ self.source_format, f".{self.source_format}"
208
+ )
209
+ tmp_fd, tmp_path = tempfile.mkstemp(suffix=tmp_suffix)
179
210
  try:
180
211
  with os.fdopen(tmp_fd, "wb") as tmp_file:
181
212
  tmp_file.write(raw_bytes)
@@ -310,6 +341,19 @@ class FilesystemToDatabaseOperator(BaseOperator):
310
341
  if peek_opts.get("lines"):
311
342
  return set(pd.read_json(path, nrows=1, **peek_opts).columns)
312
343
  return set(pd.read_json(path, **peek_opts).columns)
344
+ case "excel":
345
+ peek_opts = {k: v for k, v in options.items() if k != "sheet_name"}
346
+ return set(pd.read_excel(path, nrows=0, **peek_opts).columns)
347
+ case "avro":
348
+ import fastavro
349
+
350
+ with open(path, "rb") as f:
351
+ reader = fastavro.reader(f)
352
+ schema = reader.writer_schema
353
+ return {field["name"] for field in schema["fields"]}
354
+ case "fixed_width":
355
+ peek_opts = {k: v for k, v in options.items() if k != "chunksize"}
356
+ return set(pd.read_fwf(path, nrows=0, **peek_opts).columns)
313
357
  case _:
314
358
  return set()
315
359
 
@@ -398,6 +442,25 @@ class FilesystemToDatabaseOperator(BaseOperator):
398
442
  yield from pd.read_json(path, chunksize=self.batch_size, **options)
399
443
  else:
400
444
  yield pd.read_json(path, **options)
445
+ case "excel":
446
+ df = pd.read_excel(path, **options)
447
+ for start in range(0, max(len(df), 1), self.batch_size):
448
+ yield df.iloc[start : start + self.batch_size].copy()
449
+ case "avro":
450
+ import fastavro
451
+
452
+ with open(path, "rb") as f:
453
+ reader = fastavro.reader(f)
454
+ batch: list[dict[str, Any]] = []
455
+ for record in reader:
456
+ batch.append(record)
457
+ if len(batch) >= self.batch_size:
458
+ yield pd.DataFrame(batch)
459
+ batch = []
460
+ if batch:
461
+ yield pd.DataFrame(batch)
462
+ case "fixed_width":
463
+ yield from pd.read_fwf(path, chunksize=self.batch_size, **options)
401
464
  case _:
402
465
  raise ValueError(f"Unknown source format {self.source_format}")
403
466
 
@@ -544,5 +607,26 @@ class FilesystemToDatabaseOperator(BaseOperator):
544
607
  return pd.read_json(path_or_buf, **options)
545
608
  case "parquet":
546
609
  return pd.read_parquet(path_or_buf, **options)
610
+ case "excel":
611
+ return pd.read_excel(path_or_buf, **options)
612
+ case "avro":
613
+ import fastavro
614
+
615
+ if isinstance(path_or_buf, (str, bytes)):
616
+ buf = io.BytesIO(
617
+ path_or_buf
618
+ if isinstance(path_or_buf, bytes)
619
+ else path_or_buf.encode()
620
+ )
621
+ else:
622
+ buf = (
623
+ path_or_buf
624
+ if isinstance(path_or_buf, io.BytesIO)
625
+ else io.BytesIO(path_or_buf.read().encode())
626
+ )
627
+ records = list(fastavro.reader(buf))
628
+ return pd.DataFrame(records)
629
+ case "fixed_width":
630
+ return pd.read_fwf(path_or_buf, **options)
547
631
  case _:
548
632
  raise ValueError(f"Unknown source format {self.source_format}")
@@ -9,13 +9,9 @@ from typing import (
9
9
  Any,
10
10
  Callable,
11
11
  Generator,
12
- Literal,
13
12
  Optional,
14
- Type,
15
13
  )
16
14
 
17
- from typing import TypedDict
18
-
19
15
  import jmespath
20
16
  import pandas as pd
21
17
 
@@ -25,16 +21,20 @@ from airflow.utils.helpers import merge_dicts
25
21
  from requests import Response
26
22
 
27
23
  from airflow_toolkit._compact.airflow_shim import BaseOperator, Context, BaseHook
28
- from airflow_toolkit.compression_utils import CompressionOptions, compress
24
+ from airflow_toolkit.compression_utils import compress
29
25
  from airflow_toolkit.exceptions import ApiResponseTypeError
30
26
  from airflow_toolkit.filesystems.filesystem_factory import FilesystemFactory
31
27
  from airflow_toolkit.protocols import HttpTransformation
28
+ from airflow_toolkit.types import (
29
+ CompressionOptions,
30
+ RequestSpec,
31
+ RequestState,
32
+ SaveFormat,
33
+ )
32
34
 
33
35
  if TYPE_CHECKING:
34
36
  from requests.auth import AuthBase
35
37
 
36
- SaveFormat = Literal["jsonl"]
37
-
38
38
 
39
39
  class HttpBatchOperator(HttpOperator):
40
40
  def execute(
@@ -318,34 +318,6 @@ class HttpToFilesystem(BaseOperator):
318
318
  raise TypeError(f"Unsupported transformation output type: {type(value)!r}")
319
319
 
320
320
 
321
- class RequestSpec(TypedDict, total=False):
322
- """User-provided per-request overrides (all keys optional)."""
323
-
324
- endpoint: str
325
- method: str
326
- data: Any
327
- headers: dict[str, str]
328
- auth_type: Type["AuthBase"] | None
329
- jmespath_expression: str | None
330
- save_format: "SaveFormat"
331
- source_format: "SaveFormat"
332
- compression: "CompressionOptions" | None
333
-
334
-
335
- class RequestState(TypedDict):
336
- """Fully-resolved runtime state (all keys present)."""
337
-
338
- endpoint: str | None
339
- method: str
340
- data: Any
341
- headers: dict[str, str] | None
342
- auth_type: Type["AuthBase"] | None
343
- jmespath_expression: str | None
344
- save_format: "SaveFormat"
345
- source_format: "SaveFormat"
346
- compression: "CompressionOptions" | None
347
-
348
-
349
321
  class MultiHttpToFilesystem(HttpToFilesystem):
350
322
  """
351
323
  Execute multiple HTTP requests in a single task and save each response as a separate file.
@@ -0,0 +1,59 @@
1
+ """Testing utilities for airflow-toolkit.
2
+
3
+ Import from here in your unit tests — no Docker, no cloud credentials needed.
4
+
5
+ from airflow_toolkit.testing import MockFilesystem
6
+
7
+ fs = MockFilesystem({"data/2024-01-01.csv": b"id,name\\n1,Alice"})
8
+ fs.write(b"id,name\\n2,Bob", "data/2024-01-02.csv")
9
+ assert fs.check_file("data/2024-01-01.csv")
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from io import BytesIO
15
+
16
+
17
+ class MockFilesystem:
18
+ """In-memory implementation of FilesystemProtocol for unit testing.
19
+
20
+ Stores all files in a plain dict — no network, no Docker, no credentials.
21
+ Inspect ``fs.files`` directly in assertions.
22
+
23
+ Args:
24
+ files: Optional seed data mapping path → bytes.
25
+ """
26
+
27
+ def __init__(self, files: dict[str, bytes] | None = None) -> None:
28
+ self.files: dict[str, bytes] = dict(files or {})
29
+
30
+ def read(self, path: str) -> bytes:
31
+ if path not in self.files:
32
+ raise FileNotFoundError(f"MockFilesystem: no file at '{path}'")
33
+ return self.files[path]
34
+
35
+ def write(self, data: str | bytes | BytesIO, path: str) -> None:
36
+ if isinstance(data, str):
37
+ data = data.encode()
38
+ elif isinstance(data, BytesIO):
39
+ data = data.getvalue()
40
+ self.files[path] = data
41
+
42
+ def delete_file(self, path: str) -> None:
43
+ self.files.pop(path, None)
44
+
45
+ def create_prefix(self, prefix: str) -> None:
46
+ pass
47
+
48
+ def delete_prefix(self, prefix: str) -> None:
49
+ for key in [k for k in self.files if k.startswith(prefix)]:
50
+ del self.files[key]
51
+
52
+ def check_file(self, path: str) -> bool:
53
+ return path in self.files
54
+
55
+ def check_prefix(self, prefix: str) -> bool:
56
+ return any(k.startswith(prefix) for k in self.files)
57
+
58
+ def list_files(self, prefix: str) -> list[str]:
59
+ return [k for k in self.files if k.startswith(prefix)]
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal, Type, TypedDict
4
+
5
+ if TYPE_CHECKING:
6
+ from requests.auth import AuthBase
7
+
8
+ # ── Compression ────────────────────────────────────────────────────────────
9
+
10
+ CompressionOptions = Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"] | None
11
+
12
+ # ── Filesystem / format ────────────────────────────────────────────────────
13
+
14
+ SaveFormat = Literal["jsonl"]
15
+
16
+ # ── Metadata columns ───────────────────────────────────────────────────────
17
+ # Passed to FilesystemToDatabaseOperator as extra columns added to every row.
18
+ # Key = column name; value = Airflow template string (e.g. "{{ ds }}").
19
+ # Keys prefixed with "_" are coerced to datetime at load time.
20
+
21
+ MetadataSpec = dict[str, str]
22
+
23
+ # ── HTTP multi-request ─────────────────────────────────────────────────────
24
+
25
+
26
+ class RequestSpec(TypedDict, total=False):
27
+ """User-provided per-request overrides for MultiHttpToFilesystem (all keys optional)."""
28
+
29
+ endpoint: str
30
+ method: str
31
+ data: Any
32
+ headers: dict[str, str]
33
+ auth_type: Type["AuthBase"] | None
34
+ jmespath_expression: str | None
35
+ save_format: SaveFormat
36
+ source_format: SaveFormat
37
+ compression: CompressionOptions
38
+
39
+
40
+ class RequestState(TypedDict):
41
+ """Fully-resolved runtime state for a single HTTP request (all keys required)."""
42
+
43
+ endpoint: str | None
44
+ method: str
45
+ data: Any
46
+ headers: dict[str, str] | None
47
+ auth_type: Type["AuthBase"] | None
48
+ jmespath_expression: str | None
49
+ save_format: SaveFormat
50
+ source_format: SaveFormat
51
+ compression: CompressionOptions
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: airflow-toolkit
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: A toolkit of operators, hooks and utilities for Apache Airflow 3
5
5
  Author-email: Biel Llobera <biel_llobera@dkl.digital>
6
6
  Requires-Python: <3.15,>=3.11
@@ -32,6 +32,10 @@ Provides-Extra: duckdb
32
32
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "duckdb"
33
33
  Provides-Extra: sqlite
34
34
  Requires-Dist: apache-airflow-providers-sqlite; extra == "sqlite"
35
+ Provides-Extra: excel
36
+ Requires-Dist: openpyxl>=3.1; extra == "excel"
37
+ Provides-Extra: avro
38
+ Requires-Dist: fastavro>=1.9; extra == "avro"
35
39
  Provides-Extra: airflow3-full
36
40
  Requires-Dist: apache-airflow<4,>=3; extra == "airflow3-full"
37
41
  Requires-Dist: apache-airflow-providers-fab>=3.0.0; extra == "airflow3-full"
@@ -49,6 +53,8 @@ Requires-Dist: requests>=2.31.0; extra == "airflow3-full"
49
53
  Requires-Dist: jmespath<2,>=1.0.1; extra == "airflow3-full"
50
54
  Requires-Dist: airflow-provider-duckdb>=0.1.2; extra == "airflow3-full"
51
55
  Requires-Dist: apache-airflow-providers-sqlite; extra == "airflow3-full"
56
+ Requires-Dist: openpyxl>=3.1; extra == "airflow3-full"
57
+ Requires-Dist: fastavro>=1.9; extra == "airflow3-full"
52
58
  Dynamic: license-file
53
59
 
54
60
  # Airflow Toolkit
@@ -136,10 +142,11 @@ pip install "airflow-toolkit[airflow3-full]"
136
142
  | `google` | `providers-google` | GCS filesystem backend |
137
143
  | `azure` | `providers-microsoft-azure` | Azure Blob / ADLS filesystem backend |
138
144
  | `sftp` | `providers-sftp` | SFTP filesystem backend |
139
- | `slack` | `providers-slack` | Slack failure notifications |
140
145
  | `http` | `providers-http`, `requests`, `jmespath`, `pandas` | `HttpToFilesystem`, `MultiHttpToFilesystem` |
141
146
  | `duckdb` | `airflow-provider-duckdb` | `DuckdbToDeltalake` operator |
142
147
  | `sqlite` | `providers-sqlite` | SQLite as source or destination |
148
+ | `excel` | `openpyxl` | Excel (`.xlsx` / `.xls`) support in `FilesystemToDatabase` |
149
+ | `avro` | `fastavro` | Avro support in `FilesystemToDatabase` |
143
150
  | `airflow3-full` | all of the above | Quick start / development |
144
151
 
145
152
  ---
@@ -313,7 +320,9 @@ FilesystemToFilesystem(
313
320
 
314
321
  ### FilesystemToDatabase
315
322
 
316
- Reads files (CSV, JSON, or Parquet) from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
323
+ Reads files from any filesystem and loads them into any SQLAlchemy-compatible database. Handles schema drift automatically: columns present in the file but missing from the table are added; columns present in the table but missing from the file are filled with `NULL`.
324
+
325
+ **Supported formats:** `csv`, `json`, `parquet`, `excel`, `avro`, `fixed_width`.
317
326
 
318
327
  ```python
319
328
  from airflow_toolkit.providers.deltalake.operators.filesystem_to_database import FilesystemToDatabaseOperator
@@ -325,7 +334,7 @@ FilesystemToDatabaseOperator(
325
334
  filesystem_path='raw/orders/{{ ds }}/',
326
335
  db_schema='public',
327
336
  db_table='orders',
328
- source_format='csv',
337
+ source_format='csv', # 'csv' | 'json' | 'parquet' | 'excel' | 'avro' | 'fixed_width'
329
338
  table_aggregation_type='append', # 'append' | 'replace' | 'fail'
330
339
  metadata={
331
340
  '_ds': '{{ ds }}',
@@ -335,6 +344,52 @@ FilesystemToDatabaseOperator(
335
344
  )
336
345
  ```
337
346
 
347
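+ **Excel** (requires the `[excel]` extra, which installs `openpyxl`; note that `openpyxl` reads `.xlsx` workbooks only — legacy `.xls` files need a separate reader such as `xlrd`):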
348
+
349
+ ```python
350
+ FilesystemToDatabaseOperator(
351
+ task_id='load_excel_report',
352
+ filesystem_conn_id='my_data_lake',
353
+ database_conn_id='my_postgres',
354
+ filesystem_path='raw/reports/{{ ds }}/',
355
+ db_table='monthly_report',
356
+ source_format='excel',
357
+ source_format_options={'sheet_name': 'Data'},
358
+ )
359
+ ```
360
+
361
+ **Avro** (requires the `[avro]` extra):
362
+
363
+ ```python
364
+ FilesystemToDatabaseOperator(
365
+ task_id='load_avro_events',
366
+ filesystem_conn_id='my_data_lake',
367
+ database_conn_id='my_postgres',
368
+ filesystem_path='raw/events/{{ ds }}/',
369
+ db_table='events',
370
+ source_format='avro',
371
+ )
372
+ ```
373
+
374
+ **Fixed-width** (no extra required — pandas native):
375
+
376
+ ```python
377
+ FilesystemToDatabaseOperator(
378
+ task_id='load_fixed_width',
379
+ filesystem_conn_id='my_data_lake',
380
+ database_conn_id='my_postgres',
381
+ filesystem_path='raw/exports/{{ ds }}/',
382
+ db_table='transactions',
383
+ source_format='fixed_width',
384
+ source_format_options={
385
+ 'colspecs': [(0, 10), (10, 25), (25, 35)],
386
+ 'names': ['date', 'description', 'amount'],
387
+ },
388
+ )
389
+ ```
390
+
391
+ Each format is matched by file extension: `.csv`/`.csv.gz`, `.json`/`.json.gz`, `.parquet`/`.parquet.gz`, `.xlsx`/`.xls`, `.avro`, `.fwf`/`.txt`/`.dat`. Files with other extensions in the same prefix are silently skipped.
392
+
338
393
  ### DuckdbToDeltalake
339
394
 
340
395
  Executes a DuckDB SQL query and writes the result directly to a Delta Lake table on Azure storage. Useful for in-process transformations that land results as an open table format.
@@ -530,6 +585,50 @@ Each environment maps to a distinct colour across all channels so alerts are rec
530
585
 
531
586
  ---
532
587
 
588
+ ## Testing Utilities
589
+
590
+ ### MockFilesystem
591
+
592
+ `MockFilesystem` is an in-memory implementation of `FilesystemProtocol` for unit testing. It requires no Docker, no cloud credentials, and no network — all files are stored in a plain Python dict.
593
+
594
+ ```python
595
+ from airflow_toolkit.testing import MockFilesystem
596
+
597
+ # Pre-load files at construction time
598
+ fs = MockFilesystem({
599
+ "raw/orders/2024-01-01/data.csv": b"id,amount\n1,100\n2,200",
600
+ })
601
+
602
+ # Or write files programmatically
603
+ fs.write(b"id,amount\n3,300", "raw/orders/2024-01-02/data.csv")
604
+
605
+ # Inspect the result in assertions
606
+ assert fs.check_file("raw/orders/2024-01-01/data.csv")
607
+ assert len(fs.list_files("raw/orders/")) == 2
608
+ assert fs.files["raw/orders/2024-01-01/data.csv"] == b"id,amount\n1,100\n2,200"
609
+ ```
610
+
611
+ Use it to patch `FilesystemFactory.get_data_lake_filesystem` in your operator tests:
612
+
613
+ ```python
614
+ from unittest.mock import patch
615
+ from airflow_toolkit.testing import MockFilesystem
616
+
617
+ def test_my_pipeline(tmp_path):
618
+ fs = MockFilesystem({"data/file.csv": b"id,name\n1,Alice"})
619
+
620
+ with patch(
621
+ "airflow_toolkit.filesystems.filesystem_factory.FilesystemFactory.get_data_lake_filesystem",
622
+ return_value=fs,
623
+ ):
624
+ # run your operator or task here
625
+ ...
626
+ ```
627
+
628
+ `MockFilesystem` implements the full `FilesystemProtocol`: `read`, `write`, `delete_file`, `create_prefix`, `delete_prefix`, `check_file`, `check_prefix`, `list_files`.
629
+
630
+ ---
631
+
533
632
  ## Running Tests
534
633
 
535
634
  ### Integration tests
@@ -6,6 +6,8 @@ src/airflow_toolkit/compression_utils.py
6
6
  src/airflow_toolkit/exceptions.py
7
7
  src/airflow_toolkit/protocols.py
8
8
  src/airflow_toolkit/py.typed
9
+ src/airflow_toolkit/testing.py
10
+ src/airflow_toolkit/types.py
9
11
  src/airflow_toolkit.egg-info/PKG-INFO
10
12
  src/airflow_toolkit.egg-info/SOURCES.txt
11
13
  src/airflow_toolkit.egg-info/dependency_links.txt
@@ -21,10 +21,15 @@ requests>=2.31.0
21
21
  jmespath<2,>=1.0.1
22
22
  airflow-provider-duckdb>=0.1.2
23
23
  apache-airflow-providers-sqlite
24
+ openpyxl>=3.1
25
+ fastavro>=1.9
24
26
 
25
27
  [amazon]
26
28
  apache-airflow-providers-amazon>=9.15.0
27
29
 
30
+ [avro]
31
+ fastavro>=1.9
32
+
28
33
  [azure]
29
34
  apache-airflow-providers-microsoft-azure>=8
30
35
 
@@ -37,6 +42,9 @@ pandas<3,>=2.1.1
37
42
  [duckdb]
38
43
  airflow-provider-duckdb>=0.1.2
39
44
 
45
+ [excel]
46
+ openpyxl>=3.1
47
+
40
48
  [google]
41
49
  apache-airflow-providers-google>=18
42
50