elaunira-airflow-provider-r2index 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17) hide show
  1. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/PKG-INFO +2 -1
  2. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/pyproject.toml +2 -1
  3. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/airflow/provider/r2index/__init__.py +10 -0
  4. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/decorators/__init__.py +8 -0
  5. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/decorators/r2index.py +139 -0
  6. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/airflow/provider/r2index/hooks/r2index.py +5 -5
  7. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/links/__init__.py +5 -0
  8. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/links/r2index.py +37 -0
  9. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/operators/__init__.py +15 -0
  10. elaunira_airflow_provider_r2index-0.3.0/src/elaunira/airflow/provider/r2index/operators/r2index.py +232 -0
  11. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/.gitignore +0 -0
  12. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/LICENSE +0 -0
  13. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/README.md +0 -0
  14. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/__init__.py +0 -0
  15. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/airflow/__init__.py +0 -0
  16. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/airflow/provider/__init__.py +0 -0
  17. {elaunira_airflow_provider_r2index-0.2.1 → elaunira_airflow_provider_r2index-0.3.0}/src/elaunira/airflow/provider/r2index/hooks/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: elaunira-airflow-provider-r2index
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: Airflow provider for Elaunira R2Index connections
5
5
  Project-URL: Repository, https://github.com/elaunira/elaunira-airflow-provider-r2index
6
6
  License-Expression: MIT
@@ -8,3 +8,4 @@ License-File: LICENSE
8
8
  Requires-Python: >=3.12
9
9
  Requires-Dist: apache-airflow-providers-hashicorp
10
10
  Requires-Dist: apache-airflow>=3.0.0
11
+ Requires-Dist: elaunira-r2index
@@ -4,13 +4,14 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "elaunira-airflow-provider-r2index"
7
- version = "0.2.1"
7
+ version = "0.3.0"
8
8
  description = "Airflow provider for Elaunira R2Index connections"
9
9
  requires-python = ">=3.12"
10
10
  license = "MIT"
11
11
  dependencies = [
12
12
  "apache-airflow>=3.0.0",
13
13
  "apache-airflow-providers-hashicorp",
14
+ "elaunira-r2index",
14
15
  ]
15
16
 
16
17
  [project.urls]
@@ -17,5 +17,15 @@ def get_provider_info():
17
17
  "hook-class-name": "elaunira.airflow.provider.r2index.hooks.r2index.R2IndexHook",
18
18
  }
19
19
  ],
20
+ "task-decorators": [
21
+ {
22
+ "name": "r2index_upload",
23
+ "class-name": "elaunira.airflow.provider.r2index.decorators.r2index.r2index_upload",
24
+ },
25
+ {
26
+ "name": "r2index_download",
27
+ "class-name": "elaunira.airflow.provider.r2index.decorators.r2index.r2index_download",
28
+ },
29
+ ],
20
30
  "versions": [__version__],
21
31
  }
@@ -0,0 +1,8 @@
1
+ """R2Index TaskFlow decorators."""
2
+
3
+ from elaunira.airflow.provider.r2index.decorators.r2index import (
4
+ r2index_download,
5
+ r2index_upload,
6
+ )
7
+
8
+ __all__ = ["r2index_download", "r2index_upload"]
@@ -0,0 +1,139 @@
1
+ """R2Index TaskFlow decorators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, Callable, TypeVar
6
+
7
+ from airflow.sdk.bases.decorator import task_decorator_factory
8
+
9
+ from elaunira.airflow.provider.r2index.operators.r2index import (
10
+ DownloadItem,
11
+ R2IndexDownloadOperator,
12
+ R2IndexUploadOperator,
13
+ UploadItem,
14
+ )
15
+
16
+ if TYPE_CHECKING:
17
+ from airflow.sdk.bases.decorator import TaskDecorator
18
+
19
# NOTE(review): `F` is not referenced anywhere in this module (both decorators
# return `TaskDecorator` via task_decorator_factory) — candidate for removal.
F = TypeVar("F", bound=Callable[..., Any])
20
+
21
+
22
def r2index_upload(
    bucket: str,
    r2index_conn_id: str = "r2index_default",
    **kwargs: Any,
) -> TaskDecorator:
    """
    TaskFlow decorator that uploads the wrapped callable's result to R2Index.

    The decorated function must return an ``UploadItem`` (or a list of them)
    describing the file(s) to upload; the upload itself is performed by
    ``_R2IndexUploadDecoratedOperator`` after the callable runs.

    Example:
        @task.r2index_upload(bucket="my-bucket")
        def prepare_upload() -> UploadItem:
            return UploadItem(
                source="/tmp/data.csv",
                category="acme",
                entity="acme-data",
                extension="csv",
                media_type="text/csv",
                destination_path="acme/data",
                destination_filename="data.csv",
                destination_version="{{ ds }}",
            )

    :param bucket: R2 bucket name.
    :param r2index_conn_id: Airflow connection ID for R2Index.
    :param kwargs: Extra operator arguments forwarded to the factory.
    """
    # Delegate task construction to Airflow's decorator factory so the
    # standard @task machinery (task_id derivation, etc.) applies.
    return task_decorator_factory(
        decorated_operator_class=_R2IndexUploadDecoratedOperator,
        bucket=bucket,
        r2index_conn_id=r2index_conn_id,
        **kwargs,
    )
55
+
56
+
57
def r2index_download(
    bucket: str,
    r2index_conn_id: str = "r2index_default",
    **kwargs: Any,
) -> TaskDecorator:
    """
    TaskFlow decorator that downloads the file(s) named by the wrapped callable.

    The decorated function must return a ``DownloadItem`` (or a list of them)
    describing the file(s) to fetch; the download itself is performed by
    ``_R2IndexDownloadDecoratedOperator`` after the callable runs.

    Example:
        @task.r2index_download(bucket="my-bucket")
        def prepare_download() -> DownloadItem:
            return DownloadItem(
                source_path="acme/data",
                source_filename="data.csv",
                source_version="{{ ds }}",
                destination="/tmp/downloaded.csv",
            )

    :param bucket: R2 bucket name.
    :param r2index_conn_id: Airflow connection ID for R2Index.
    :param kwargs: Extra operator arguments forwarded to the factory.
    """
    # Delegate task construction to Airflow's decorator factory so the
    # standard @task machinery (task_id derivation, etc.) applies.
    return task_decorator_factory(
        decorated_operator_class=_R2IndexDownloadDecoratedOperator,
        bucket=bucket,
        r2index_conn_id=r2index_conn_id,
        **kwargs,
    )
86
+
87
+
88
class _R2IndexUploadDecoratedOperator(R2IndexUploadOperator):
    """Operator backing the ``@task.r2index_upload`` decorator."""

    custom_operator_name = "@task.r2index_upload"

    def __init__(
        self,
        *,
        python_callable: Callable[..., UploadItem | list[UploadItem]],
        op_args: list[Any] | None = None,
        op_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        self.python_callable = python_callable
        self.op_args = op_args if op_args else []
        self.op_kwargs = op_kwargs if op_kwargs else {}
        # The parent operator requires ``items`` at construction time, but the
        # real items are only produced by the callable during execute().
        kwargs["items"] = []
        super().__init__(**kwargs)

    def execute(self, context: Any) -> list[dict[str, Any]]:
        """Run the wrapped callable, then upload whatever it returned."""
        produced = self.python_callable(*self.op_args, **self.op_kwargs)
        # Normalize a single item to a one-element list for the parent.
        if isinstance(produced, UploadItem):
            produced = [produced]
        self.items = produced
        return super().execute(context)
113
+
114
+
115
class _R2IndexDownloadDecoratedOperator(R2IndexDownloadOperator):
    """Operator backing the ``@task.r2index_download`` decorator."""

    custom_operator_name = "@task.r2index_download"

    def __init__(
        self,
        *,
        python_callable: Callable[..., DownloadItem | list[DownloadItem]],
        op_args: list[Any] | None = None,
        op_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        self.python_callable = python_callable
        self.op_args = op_args if op_args else []
        self.op_kwargs = op_kwargs if op_kwargs else {}
        # The parent operator requires ``items`` at construction time, but the
        # real items are only produced by the callable during execute().
        kwargs["items"] = []
        super().__init__(**kwargs)

    def execute(self, context: Any) -> list[dict[str, Any]]:
        """Run the wrapped callable, then download whatever it returned."""
        produced = self.python_callable(*self.op_args, **self.op_kwargs)
        # Normalize a single item to a one-element list for the parent.
        if isinstance(produced, DownloadItem):
            produced = [produced]
        self.items = produced
        return super().execute(context)
@@ -25,7 +25,7 @@ class R2IndexHook(BaseHook):
25
25
  {
26
26
  "vault_conn_id": "openbao_default",
27
27
  "vault_namespace": "elaunira/production",
28
- "vault_secrets": {
28
+ "vault_secrets_mapping": {
29
29
  "r2index_api_url": "cloudflare/r2index#api-url",
30
30
  "r2index_api_token": "cloudflare/r2index#api-token",
31
31
  "r2_access_key_id": "cloudflare/r2/airflow#access-key-id",
@@ -37,7 +37,7 @@ class R2IndexHook(BaseHook):
37
37
  The vault_conn_id references an Airflow HashiCorp Vault connection
38
38
  configured with AppRole or other auth method.
39
39
 
40
- vault_secrets format: "path#key" or "path" (uses config key as secret key)
40
+ vault_secrets_mapping format: "path#key" or "path" (uses config key as secret key)
41
41
  Required keys:
42
42
  - r2index_api_url
43
43
  - r2index_api_token
@@ -87,7 +87,7 @@ class R2IndexHook(BaseHook):
87
87
  "password": "API token for direct connection",
88
88
  "vault_conn_id": "openbao-myservice",
89
89
  "vault_namespace": "myservice/production",
90
- "vault_secrets": '{"r2index_api_url": "cloudflare/r2index#api-url", ...}',
90
+ "vault_secrets_mapping": '{"r2index_api_url": "cloudflare/r2index#api-url", ...}',
91
91
  "r2_access_key_id": "Direct mode: R2 access key ID",
92
92
  "r2_secret_access_key": "Direct mode: R2 secret access key",
93
93
  "r2_endpoint_url": "https://account.r2.cloudflarestorage.com",
@@ -112,7 +112,7 @@ class R2IndexHook(BaseHook):
112
112
  widget=BS3TextFieldWidget(),
113
113
  description="OpenBao namespace (e.g., elaunira/production)",
114
114
  ),
115
- "vault_secrets": StringField(
115
+ "vault_secrets_mapping": StringField(
116
116
  lazy_gettext("Vault Secrets (JSON)"),
117
117
  widget=BS3TextFieldWidget(),
118
118
  description="JSON mapping of config keys to secret paths",
@@ -213,7 +213,7 @@ class R2IndexHook(BaseHook):
213
213
 
214
214
  vault_conn_id = extra.get("vault_conn_id")
215
215
  if vault_conn_id:
216
- secrets_raw = extra.get("vault_secrets")
216
+ secrets_raw = extra.get("vault_secrets_mapping")
217
217
  if not secrets_raw:
218
218
  return None
219
219
  if isinstance(secrets_raw, str):
@@ -0,0 +1,5 @@
1
+ """R2Index operator extra links."""
2
+
3
+ from elaunira.airflow.provider.r2index.links.r2index import R2IndexFileLink
4
+
5
+ __all__ = ["R2IndexFileLink"]
@@ -0,0 +1,37 @@
1
+ """R2Index operator extra links."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from airflow.models import BaseOperatorLink
8
+
9
+ if TYPE_CHECKING:
10
+ from airflow.models.taskinstancekey import TaskInstanceKey
11
+
12
+
13
class R2IndexFileLink(BaseOperatorLink):
    """
    Link to the R2Index file details.

    This link extracts the file ID from the operator's XCom return value
    and constructs a URL to view the file in the R2Index UI.
    """

    name = "R2Index File"

    def get_link(
        self,
        operator: Any,
        *,
        ti_key: TaskInstanceKey,
    ) -> str:
        """Return the R2Index UI URL for the task's file, or "" if unknown."""
        from airflow.models import XCom

        result = XCom.get_value(ti_key=ti_key)
        # Bug fix: the R2Index operators return a *list* of result dicts
        # (one per item), so the old dict-only check never matched and the
        # link was always empty. Use the first entry when given a list.
        if isinstance(result, list) and result:
            result = result[0]
        if isinstance(result, dict):
            file_id = result.get("id") or result.get("file_record", {}).get("id")
            if file_id:
                return f"https://r2index.elaunira.com/files/{file_id}"
        return ""
@@ -0,0 +1,15 @@
1
+ """R2Index operators for Airflow."""
2
+
3
+ from elaunira.airflow.provider.r2index.operators.r2index import (
4
+ DownloadItem,
5
+ R2IndexDownloadOperator,
6
+ R2IndexUploadOperator,
7
+ UploadItem,
8
+ )
9
+
10
+ __all__ = [
11
+ "DownloadItem",
12
+ "R2IndexDownloadOperator",
13
+ "R2IndexUploadOperator",
14
+ "UploadItem",
15
+ ]
@@ -0,0 +1,232 @@
1
+ """R2Index operators for Airflow."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import Sequence
7
+ from dataclasses import dataclass
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from airflow.exceptions import AirflowException
11
+ from airflow.models import BaseOperator
12
+
13
+ from elaunira.airflow.provider.r2index.hooks import R2IndexHook
14
+ from elaunira.airflow.provider.r2index.links.r2index import R2IndexFileLink
15
+ from elaunira.r2index import AsyncR2IndexClient
16
+
17
+ if TYPE_CHECKING:
18
+ from airflow.utils.context import Context
19
+
20
+
21
+ @dataclass
22
+ class UploadItem:
23
+ """Defines a single file to upload."""
24
+
25
+ source: str
26
+ category: str
27
+ entity: str
28
+ extension: str
29
+ media_type: str
30
+ destination_path: str
31
+ destination_filename: str
32
+ destination_version: str
33
+ bucket: str | None = None
34
+ name: str | None = None
35
+ tags: list[str] | None = None
36
+ extra: dict[str, Any] | None = None
37
+ create_checksum_files: bool = False
38
+ r2index_conn_id: str | None = None
39
+
40
+
41
+ @dataclass
42
+ class DownloadItem:
43
+ """Defines a single file to download."""
44
+
45
+ source_path: str
46
+ source_filename: str
47
+ source_version: str
48
+ destination: str
49
+ bucket: str | None = None
50
+ verify_checksum: bool = True
51
+ r2index_conn_id: str | None = None
52
+
53
+
54
class R2IndexUploadOperator(BaseOperator):
    """
    Upload one or more files to R2 in parallel.

    When multiple items are provided, all uploads run concurrently using
    asyncio within the worker process.

    :param bucket: Default R2 bucket name (can be overridden per item).
    :param items: List of UploadItem or single UploadItem defining files to upload.
    :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
    """

    template_fields: Sequence[str] = ("items",)
    template_ext: Sequence[str] = ()
    ui_color = "#e4f0e8"
    operator_extra_links = (R2IndexFileLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        items: list[UploadItem] | UploadItem,
        r2index_conn_id: str = "r2index_default",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        # Normalize a single item to a one-element list.
        self.items = [items] if isinstance(items, UploadItem) else items
        self.r2index_conn_id = r2index_conn_id

    def _get_client_config(self, conn_id: str) -> dict[str, Any]:
        """Get client configuration using the hook's priority chain."""
        hook = R2IndexHook(r2index_conn_id=conn_id)
        config = hook._get_config_from_connection()
        if config is None or not config.get("index_api_url"):
            config = hook._get_config_from_env()
        return config or {}

    def _resolve_configs(self) -> dict[str, dict[str, Any]]:
        """Resolve the client config once per distinct connection ID.

        Bug fix: this used to run inside the coroutines via an async helper,
        but the hook performs *blocking* connection/secret lookups, which
        would stall the event loop and serialize the concurrent uploads.
        Resolving everything synchronously before the loop starts keeps the
        transfers truly parallel.
        """
        conn_ids = {item.r2index_conn_id or self.r2index_conn_id for item in self.items}
        return {conn_id: self._get_client_config(conn_id) for conn_id in conn_ids}

    def execute(self, context: Context) -> list[dict[str, Any]]:
        """Execute the uploads in parallel and return one file record per item.

        :raises AirflowException: if any upload fails.
        """
        conn_configs = self._resolve_configs()

        async def upload_one(item: UploadItem) -> dict[str, Any]:
            conn_id = item.r2index_conn_id or self.r2index_conn_id
            bucket = item.bucket or self.bucket
            config = conn_configs[conn_id]

            try:
                async with AsyncR2IndexClient(**config) as client:
                    file_record = await client.upload(
                        bucket=bucket,
                        source=item.source,
                        category=item.category,
                        entity=item.entity,
                        extension=item.extension,
                        media_type=item.media_type,
                        destination_path=item.destination_path,
                        destination_filename=item.destination_filename,
                        destination_version=item.destination_version,
                        name=item.name,
                        tags=item.tags,
                        extra=item.extra,
                        create_checksum_files=item.create_checksum_files,
                    )
                    return {"status": "success", "file_record": file_record.model_dump()}
            except Exception as e:
                # Record per-item failures instead of cancelling siblings;
                # aggregated and raised after all uploads settle.
                return {"status": "error", "message": str(e), "source": item.source}

        async def upload_all() -> list[dict[str, Any]]:
            return await asyncio.gather(*(upload_one(item) for item in self.items))

        results = asyncio.run(upload_all())

        errors = [r for r in results if r.get("status") == "error"]
        if errors:
            self.log.error("Failed uploads: %s", errors)
            raise AirflowException(f"{len(errors)}/{len(results)} uploads failed")

        self.log.info("Uploaded %d files in parallel", len(results))
        return [r["file_record"] for r in results]
142
+
143
+
144
class R2IndexDownloadOperator(BaseOperator):
    """
    Download one or more files from R2 in parallel.

    When multiple items are provided, all downloads run concurrently using
    asyncio within the worker process.

    :param bucket: Default R2 bucket name (can be overridden per item).
    :param items: List of DownloadItem or single DownloadItem defining files to download.
    :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
    """

    template_fields: Sequence[str] = ("items",)
    template_ext: Sequence[str] = ()
    ui_color = "#f0e4e8"
    operator_extra_links = (R2IndexFileLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        items: list[DownloadItem] | DownloadItem,
        r2index_conn_id: str = "r2index_default",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        # Normalize a single item to a one-element list.
        self.items = [items] if isinstance(items, DownloadItem) else items
        self.r2index_conn_id = r2index_conn_id

    def _get_client_config(self, conn_id: str) -> dict[str, Any]:
        """Get client configuration using the hook's priority chain."""
        hook = R2IndexHook(r2index_conn_id=conn_id)
        config = hook._get_config_from_connection()
        if config is None or not config.get("index_api_url"):
            config = hook._get_config_from_env()
        return config or {}

    def _resolve_configs(self) -> dict[str, dict[str, Any]]:
        """Resolve the client config once per distinct connection ID.

        Bug fix: this used to run inside the coroutines via an async helper,
        but the hook performs *blocking* connection/secret lookups, which
        would stall the event loop and serialize the concurrent downloads.
        Resolving everything synchronously before the loop starts keeps the
        transfers truly parallel.
        """
        conn_ids = {item.r2index_conn_id or self.r2index_conn_id for item in self.items}
        return {conn_id: self._get_client_config(conn_id) for conn_id in conn_ids}

    def execute(self, context: Context) -> list[dict[str, Any]]:
        """Execute the downloads in parallel; returns path + file record per item.

        :raises AirflowException: if any download fails.
        """
        conn_configs = self._resolve_configs()

        async def download_one(item: DownloadItem) -> dict[str, Any]:
            conn_id = item.r2index_conn_id or self.r2index_conn_id
            bucket = item.bucket or self.bucket
            config = conn_configs[conn_id]

            try:
                async with AsyncR2IndexClient(**config) as client:
                    downloaded_path, file_record = await client.download(
                        bucket=bucket,
                        source_path=item.source_path,
                        source_filename=item.source_filename,
                        source_version=item.source_version,
                        destination=item.destination,
                        verify_checksum=item.verify_checksum,
                    )
                    return {
                        "status": "success",
                        "path": str(downloaded_path),
                        "file_record": file_record.model_dump(),
                    }
            except Exception as e:
                # Record per-item failures instead of cancelling siblings;
                # aggregated and raised after all downloads settle.
                return {
                    "status": "error",
                    "message": str(e),
                    "source_path": item.source_path,
                }

        async def download_all() -> list[dict[str, Any]]:
            return await asyncio.gather(*(download_one(item) for item in self.items))

        results = asyncio.run(download_all())

        errors = [r for r in results if r.get("status") == "error"]
        if errors:
            self.log.error("Failed downloads: %s", errors)
            raise AirflowException(f"{len(errors)}/{len(results)} downloads failed")

        self.log.info("Downloaded %d files in parallel", len(results))
        return [{"path": r["path"], "file_record": r["file_record"]} for r in results]