elaunira-airflow-providers-r2index 0.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. elaunira_airflow_providers_r2index-0.4.0/.gitignore +27 -0
  2. elaunira_airflow_providers_r2index-0.4.0/LICENSE +21 -0
  3. elaunira_airflow_providers_r2index-0.4.0/PKG-INFO +11 -0
  4. elaunira_airflow_providers_r2index-0.4.0/README.md +169 -0
  5. elaunira_airflow_providers_r2index-0.4.0/pyproject.toml +24 -0
  6. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/__init__.py +1 -0
  7. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/__init__.py +1 -0
  8. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/__init__.py +1 -0
  9. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/__init__.py +31 -0
  10. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/decorators/__init__.py +8 -0
  11. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/decorators/r2index.py +139 -0
  12. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/hooks/__init__.py +5 -0
  13. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/hooks/r2index.py +350 -0
  14. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/links/__init__.py +5 -0
  15. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/links/r2index.py +37 -0
  16. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/operators/__init__.py +15 -0
  17. elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/operators/r2index.py +232 -0
@@ -0,0 +1,27 @@ elaunira_airflow_providers_r2index-0.4.0/.gitignore
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ build/
+ dist/
+ *.egg-info/
+ *.egg
+
+ # Virtual environments
+ .venv/
+ venv/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.iml
+
+ # Testing
+ .pytest_cache/
+ .coverage
+ htmlcov/
+
+ # OS
+ .DS_Store
@@ -0,0 +1,21 @@ elaunira_airflow_providers_r2index-0.4.0/LICENSE
+ MIT License
+
+ Copyright (c) 2025 Elaunira
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,11 @@ elaunira_airflow_providers_r2index-0.4.0/PKG-INFO
+ Metadata-Version: 2.4
+ Name: elaunira-airflow-providers-r2index
+ Version: 0.4.0
+ Summary: Airflow provider for Elaunira R2Index connections
+ Project-URL: Repository, https://github.com/elaunira/elaunira-airflow-providers-r2index
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: >=3.12
+ Requires-Dist: apache-airflow-providers-hashicorp
+ Requires-Dist: apache-airflow>=3.0.0
+ Requires-Dist: elaunira-r2index
@@ -0,0 +1,169 @@ elaunira_airflow_providers_r2index-0.4.0/README.md
+ # Elaunira Airflow Provider for R2Index
+
+ Airflow provider package for R2Index, providing a connection type, operators, and TaskFlow decorators.
+
+ ## Installation
+
+ ```bash
+ pip install elaunira-airflow-providers-r2index
+ ```
+
+ ## Features
+
+ - **R2Index** connection type in Airflow UI
+ - `R2IndexUploadOperator` and `R2IndexDownloadOperator` for file transfers
+ - `@task.r2index_upload` and `@task.r2index_download` TaskFlow decorators
+
+ ## Connection Configuration
+
+ After installation, the **R2Index** connection type is available in Airflow's connection UI.
+
+ ### Vault/OpenBao Mode (Recommended)
+
+ In this mode, the hook fetches credentials dynamically from HashiCorp Vault or OpenBao:
+
+ | Field | Description |
+ |-------|-------------|
+ | Vault Connection ID | Airflow Vault connection ID (e.g., `openbao-elaunira`) |
+ | Vault Namespace | OpenBao namespace (e.g., `elaunira/production`) |
+ | Vault Secrets Mapping | JSON mapping of config keys to secret paths |
+
+ Example Vault Secrets Mapping:
+ ```json
+ {
+   "r2index_api_url": "cloudflare/r2index#api-url",
+   "r2index_api_token": "cloudflare/r2index#api-token",
+   "r2_access_key_id": "cloudflare/r2/e2e-tests#access-key-id",
+   "r2_secret_access_key": "cloudflare/r2/e2e-tests#secret-access-key",
+   "r2_endpoint_url": "cloudflare/r2/e2e-tests#endpoint-url"
+ }
+ ```
+
+ Secret path format: `path#key` (e.g., `cloudflare/r2index#api-url`). A value without a `#key` suffix uses the config key name as the secret key.
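
For illustration, each mapping value names a Vault secret path and a key inside that secret; the sketch below mirrors the `_parse_secret_ref` helper in this package's hook:

```python
def parse_secret_ref(secret_ref: str, default_key: str) -> tuple[str, str]:
    # "cloudflare/r2index#api-url" -> ("cloudflare/r2index", "api-url")
    # "cloudflare/r2index"         -> ("cloudflare/r2index", default_key)
    if "#" in secret_ref:
        path, key = secret_ref.rsplit("#", 1)
        return path, key
    return secret_ref, default_key
```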
+
+ ### Direct Mode
+
+ Store credentials directly in the connection:
+
+ | Field | Description |
+ |-------|-------------|
+ | R2Index API URL | API endpoint URL |
+ | R2Index API Token | API authentication token |
+ | R2 Access Key ID | Cloudflare R2 access key |
+ | R2 Secret Access Key | Cloudflare R2 secret key |
+ | R2 Endpoint URL | Cloudflare R2 endpoint |
+
+ ### Environment Variables Fallback
+
+ If no connection is configured, the hook falls back to environment variables (see the sketch after this list):
+
+ - `R2INDEX_API_URL`
+ - `R2INDEX_API_TOKEN`
+ - `R2_ACCESS_KEY_ID`
+ - `R2_SECRET_ACCESS_KEY`
+ - `R2_ENDPOINT_URL`
+
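A minimal sketch of this fallback, assuming the variables are set in the worker environment and no `r2index_default` connection is defined (all values are placeholders):

```python
import os

os.environ["R2INDEX_API_URL"] = "https://r2index.example.com"
os.environ["R2INDEX_API_TOKEN"] = "example-token"
os.environ["R2_ACCESS_KEY_ID"] = "example-key-id"
os.environ["R2_SECRET_ACCESS_KEY"] = "example-secret"
os.environ["R2_ENDPOINT_URL"] = "https://account.r2.cloudflarestorage.com"

from elaunira.airflow.providers.r2index.hooks import R2IndexHook

# With no Airflow connection found, the hook builds the client from the
# environment variables above.
client = R2IndexHook().get_conn()
```
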
+ ## Operators
+
+ ### R2IndexUploadOperator
+
+ Upload files to R2Index:
+
+ ```python
+ from elaunira.airflow.providers.r2index.operators import R2IndexUploadOperator, UploadItem
+
+ upload = R2IndexUploadOperator(
+     task_id="upload_file",
+     bucket="my-bucket",
+     r2index_conn_id="my_r2index_connection",
+     items=UploadItem(
+         source="/tmp/data.csv",
+         category="example",
+         entity="sample-data",
+         extension="csv",
+         media_type="text/csv",
+         destination_path="example/data",
+         destination_filename="data.csv",
+         destination_version="{{ ds }}",
+     ),
+ )
+ ```
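
`items` also accepts a list; when several `UploadItem`s are passed, the operator runs the uploads concurrently within a single task. A sketch reusing the values above:

```python
upload_many = R2IndexUploadOperator(
    task_id="upload_files",
    bucket="my-bucket",
    r2index_conn_id="my_r2index_connection",
    items=[
        UploadItem(
            source=f"/tmp/data-{i}.csv",
            category="example",
            entity="sample-data",
            extension="csv",
            media_type="text/csv",
            destination_path="example/data",
            destination_filename=f"data-{i}.csv",
            destination_version="{{ ds }}",
        )
        for i in range(3)
    ],
)
```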
+
+ ### R2IndexDownloadOperator
+
+ Download files from R2Index:
+
+ ```python
+ from elaunira.airflow.providers.r2index.operators import R2IndexDownloadOperator, DownloadItem
+
+ download = R2IndexDownloadOperator(
+     task_id="download_file",
+     bucket="my-bucket",
+     r2index_conn_id="my_r2index_connection",
+     items=DownloadItem(
+         source_path="example/data",
+         source_filename="data.csv",
+         source_version="{{ ds }}",
+         destination="/tmp/downloaded.csv",
+     ),
+ )
+ ```
+
+ ## TaskFlow Decorators
+
+ ### @task.r2index_upload
+
+ ```python
+ from airflow.sdk import dag, task
+ from elaunira.airflow.providers.r2index.operators import UploadItem
+
+ @dag(schedule=None)
+ def my_dag():
+     @task.r2index_upload(bucket="my-bucket", r2index_conn_id="my_connection")
+     def prepare_upload() -> UploadItem:
+         return UploadItem(
+             source="/tmp/data.csv",
+             category="example",
+             entity="sample-data",
+             extension="csv",
+             media_type="text/csv",
+             destination_path="example/data",
+             destination_filename="data.csv",
+             destination_version="2024-01-01",
+         )
+
+     prepare_upload()
+
+ my_dag()
+ ```
+
+ ### @task.r2index_download
+
+ ```python
+ from airflow.sdk import dag, task
+ from elaunira.airflow.providers.r2index.operators import DownloadItem
+
+ @dag(schedule=None)
+ def my_dag():
+     @task.r2index_download(bucket="my-bucket", r2index_conn_id="my_connection")
+     def prepare_download() -> DownloadItem:
+         return DownloadItem(
+             source_path="example/data",
+             source_filename="data.csv",
+             source_version="2024-01-01",
+             destination="/tmp/downloaded.csv",
+         )
+
+     prepare_download()
+
+ my_dag()
+ ```
+
+ ## Hook Usage
+
+ ```python
+ from elaunira.airflow.providers.r2index.hooks import R2IndexHook
+
+ hook = R2IndexHook(r2index_conn_id="my_r2index_connection")
+ client = hook.get_conn()
+ ```
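
The hook also exposes `upload`, `download`, `get_file`, and `list_files` wrappers around the client; a sketch with illustrative arguments:

```python
record = hook.upload(
    bucket="my-bucket",
    source="/tmp/data.csv",
    category="example",
    entity="sample-data",
    extension="csv",
    media_type="text/csv",
    destination_path="example/data",
    destination_filename="data.csv",
    destination_version="2024-01-01",
)

listing = hook.list_files(bucket="my-bucket", category="example", limit=10)
```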
+
+ ## License
+
+ MIT
@@ -0,0 +1,24 @@ elaunira_airflow_providers_r2index-0.4.0/pyproject.toml
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "elaunira-airflow-providers-r2index"
+ version = "0.4.0"
+ description = "Airflow provider for Elaunira R2Index connections"
+ requires-python = ">=3.12"
+ license = "MIT"
+ dependencies = [
+     "apache-airflow>=3.0.0",
+     "apache-airflow-providers-hashicorp",
+     "elaunira-r2index",
+ ]
+
+ [project.urls]
+ Repository = "https://github.com/elaunira/elaunira-airflow-providers-r2index"
+
+ [project.entry-points."apache_airflow_provider"]
+ provider_info = "elaunira.airflow.providers.r2index:get_provider_info"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/elaunira"]
@@ -0,0 +1 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/__init__.py
+ """Elaunira namespace package."""
@@ -0,0 +1 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/__init__.py
+ """Elaunira Airflow namespace package."""
@@ -0,0 +1 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/__init__.py
+ """Elaunira Airflow providers namespace package."""
@@ -0,0 +1,31 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/__init__.py
+ """Elaunira R2Index Airflow provider."""
+
+ from importlib.metadata import version
+
+ __version__ = version("elaunira-airflow-providers-r2index")
+
+
+ def get_provider_info():
+     """Return provider metadata for Airflow."""
+     return {
+         "package-name": "elaunira-airflow-providers-r2index",
+         "name": "R2Index",
+         "description": "Airflow provider for R2Index connections",
+         "connection-types": [
+             {
+                 "connection-type": "r2index",
+                 "hook-class-name": "elaunira.airflow.providers.r2index.hooks.r2index.R2IndexHook",
+             }
+         ],
+         "task-decorators": [
+             {
+                 "name": "r2index_upload",
+                 "class-name": "elaunira.airflow.providers.r2index.decorators.r2index.r2index_upload",
+             },
+             {
+                 "name": "r2index_download",
+                 "class-name": "elaunira.airflow.providers.r2index.decorators.r2index.r2index_download",
+             },
+         ],
+         "versions": [__version__],
+     }
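
For reference, a quick check (a sketch, not part of the package) that the `apache_airflow_provider` entry point declared in `pyproject.toml` resolves to the `get_provider_info` above once the distribution is installed:

```python
from importlib.metadata import entry_points

for ep in entry_points(group="apache_airflow_provider"):
    if "r2index" in ep.value:
        info = ep.load()()  # ep.load() returns get_provider_info
        print(info["package-name"], info["versions"])
```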
@@ -0,0 +1,8 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/decorators/__init__.py
+ """R2Index TaskFlow decorators."""
+
+ from elaunira.airflow.providers.r2index.decorators.r2index import (
+     r2index_download,
+     r2index_upload,
+ )
+
+ __all__ = ["r2index_download", "r2index_upload"]
@@ -0,0 +1,139 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/decorators/r2index.py
+ """R2Index TaskFlow decorators."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, Callable, TypeVar
+
+ from airflow.sdk.bases.decorator import task_decorator_factory
+
+ from elaunira.airflow.providers.r2index.operators.r2index import (
+     DownloadItem,
+     R2IndexDownloadOperator,
+     R2IndexUploadOperator,
+     UploadItem,
+ )
+
+ if TYPE_CHECKING:
+     from airflow.sdk.bases.decorator import TaskDecorator
+
+ F = TypeVar("F", bound=Callable[..., Any])
+
+
+ def r2index_upload(
+     bucket: str,
+     r2index_conn_id: str = "r2index_default",
+     **kwargs: Any,
+ ) -> TaskDecorator:
+     """
+     Decorator to upload files to R2Index.
+
+     The decorated function should return an UploadItem or list of UploadItems.
+
+     Example:
+         @task.r2index_upload(bucket="my-bucket")
+         def prepare_upload() -> UploadItem:
+             return UploadItem(
+                 source="/tmp/data.csv",
+                 category="acme",
+                 entity="acme-data",
+                 extension="csv",
+                 media_type="text/csv",
+                 destination_path="acme/data",
+                 destination_filename="data.csv",
+                 destination_version="{{ ds }}",
+             )
+
+     :param bucket: R2 bucket name.
+     :param r2index_conn_id: Airflow connection ID for R2Index.
+     """
+     return task_decorator_factory(
+         decorated_operator_class=_R2IndexUploadDecoratedOperator,
+         bucket=bucket,
+         r2index_conn_id=r2index_conn_id,
+         **kwargs,
+     )
+
+
+ def r2index_download(
+     bucket: str,
+     r2index_conn_id: str = "r2index_default",
+     **kwargs: Any,
+ ) -> TaskDecorator:
+     """
+     Decorator to download files from R2Index.
+
+     The decorated function should return a DownloadItem or list of DownloadItems.
+
+     Example:
+         @task.r2index_download(bucket="my-bucket")
+         def prepare_download() -> DownloadItem:
+             return DownloadItem(
+                 source_path="acme/data",
+                 source_filename="data.csv",
+                 source_version="{{ ds }}",
+                 destination="/tmp/downloaded.csv",
+             )
+
+     :param bucket: R2 bucket name.
+     :param r2index_conn_id: Airflow connection ID for R2Index.
+     """
+     return task_decorator_factory(
+         decorated_operator_class=_R2IndexDownloadDecoratedOperator,
+         bucket=bucket,
+         r2index_conn_id=r2index_conn_id,
+         **kwargs,
+     )
+
+
+ class _R2IndexUploadDecoratedOperator(R2IndexUploadOperator):
+     """Decorated operator for R2Index uploads."""
+
+     custom_operator_name = "@task.r2index_upload"
+
+     def __init__(
+         self,
+         *,
+         python_callable: Callable[..., UploadItem | list[UploadItem]],
+         op_args: list[Any] | None = None,
+         op_kwargs: dict[str, Any] | None = None,
+         **kwargs: Any,
+     ) -> None:
+         self.python_callable = python_callable
+         self.op_args = op_args or []
+         self.op_kwargs = op_kwargs or {}
+         # items will be set in execute()
+         kwargs["items"] = []
+         super().__init__(**kwargs)
+
+     def execute(self, context: Any) -> list[dict[str, Any]]:
+         """Execute the decorated function and upload the result."""
+         items = self.python_callable(*self.op_args, **self.op_kwargs)
+         self.items = [items] if isinstance(items, UploadItem) else items
+         return super().execute(context)
+
+
+ class _R2IndexDownloadDecoratedOperator(R2IndexDownloadOperator):
+     """Decorated operator for R2Index downloads."""
+
+     custom_operator_name = "@task.r2index_download"
+
+     def __init__(
+         self,
+         *,
+         python_callable: Callable[..., DownloadItem | list[DownloadItem]],
+         op_args: list[Any] | None = None,
+         op_kwargs: dict[str, Any] | None = None,
+         **kwargs: Any,
+     ) -> None:
+         self.python_callable = python_callable
+         self.op_args = op_args or []
+         self.op_kwargs = op_kwargs or {}
+         # items will be set in execute()
+         kwargs["items"] = []
+         super().__init__(**kwargs)
+
+     def execute(self, context: Any) -> list[dict[str, Any]]:
+         """Execute the decorated function and download the result."""
+         items = self.python_callable(*self.op_args, **self.op_kwargs)
+         self.items = [items] if isinstance(items, DownloadItem) else items
+         return super().execute(context)
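
Because `execute()` wraps a single item in a list and passes a list through unchanged, the decorated callable may return several items to transfer them concurrently. A hypothetical DAG sketch (not part of the package):

```python
from airflow.sdk import dag, task

from elaunira.airflow.providers.r2index.operators import DownloadItem


@dag(schedule=None)
def bulk_download():
    @task.r2index_download(bucket="my-bucket", r2index_conn_id="r2index_default")
    def prepare() -> list[DownloadItem]:
        # Three downloads, executed concurrently by the decorated operator.
        return [
            DownloadItem(
                source_path="example/data",
                source_filename=f"part-{i}.csv",
                source_version="2024-01-01",
                destination=f"/tmp/part-{i}.csv",
            )
            for i in range(3)
        ]

    prepare()


bulk_download()
```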
@@ -0,0 +1,5 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/hooks/__init__.py
+ """Elaunira R2Index hooks."""
+
+ from elaunira.airflow.providers.r2index.hooks.r2index import R2IndexHook
+
+ __all__ = ["R2IndexHook"]
@@ -0,0 +1,350 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/hooks/r2index.py
+ """R2Index hook for Airflow."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ from typing import TYPE_CHECKING, Any
+
+ from airflow.sdk.bases.hook import BaseHook
+
+ if TYPE_CHECKING:
+     from elaunira.r2index import R2IndexClient
+
+
+ class R2IndexHook(BaseHook):
+     """
+     Hook for interacting with R2Index API and R2 storage.
+
+     Configuration priority:
+     1. Airflow connection with Vault/OpenBao reference (fetches secrets dynamically)
+     2. Airflow connection with direct credentials
+     3. Environment variables (fallback)
+
+     Airflow connection with Vault/OpenBao reference (extra JSON):
+         {
+             "vault_conn_id": "openbao_default",
+             "vault_namespace": "elaunira/production",
+             "vault_secrets_mapping": {
+                 "r2index_api_url": "cloudflare/r2index#api-url",
+                 "r2index_api_token": "cloudflare/r2index#api-token",
+                 "r2_access_key_id": "cloudflare/r2/airflow#access-key-id",
+                 "r2_secret_access_key": "cloudflare/r2/airflow#secret-access-key",
+                 "r2_endpoint_url": "cloudflare/r2/airflow#endpoint-url"
+             }
+         }
+
+     The vault_conn_id references an Airflow HashiCorp Vault connection
+     configured with AppRole or other auth method.
+
+     vault_secrets_mapping format: "path#key" or "path" (uses config key as secret key)
+     Required keys:
+     - r2index_api_url
+     - r2index_api_token
+     - r2_access_key_id
+     - r2_secret_access_key
+     - r2_endpoint_url
+
+     Airflow connection with direct credentials:
+     - host: R2Index API URL
+     - password: R2Index API token
+     - extra.r2_access_key_id: R2 access key ID
+     - extra.r2_secret_access_key: R2 secret access key
+     - extra.r2_endpoint_url: R2 endpoint URL
+
+     Environment variables (fallback):
+     - R2INDEX_API_URL
+     - R2INDEX_API_TOKEN
+     - R2_ACCESS_KEY_ID
+     - R2_SECRET_ACCESS_KEY
+     - R2_ENDPOINT_URL
+     """
+
+     conn_name_attr = "r2index_conn_id"
+     default_conn_name = "r2index_default"
+     conn_type = "r2index"
+     hook_name = "R2Index"
+
+     CONFIG_KEYS = [
+         "r2index_api_url",
+         "r2index_api_token",
+         "r2_access_key_id",
+         "r2_secret_access_key",
+         "r2_endpoint_url",
+     ]
+
+     @classmethod
+     def get_ui_field_behaviour(cls) -> dict[str, Any]:
+         """Customize connection form UI."""
+         return {
+             "hidden_fields": ["port", "schema", "login", "extra"],
+             "relabeling": {
+                 "host": "R2Index API URL (direct mode only)",
+                 "password": "R2Index API Token (direct mode only)",
+             },
+             "placeholders": {
+                 "host": "https://r2index.example.com",
+                 "password": "API token for direct connection",
+                 "vault_conn_id": "openbao-myservice",
+                 "vault_namespace": "myservice/production",
+                 "vault_secrets_mapping": '{"r2index_api_url": "cloudflare/r2index#api-url", ...}',
+                 "r2_access_key_id": "Direct mode: R2 access key ID",
+                 "r2_secret_access_key": "Direct mode: R2 secret access key",
+                 "r2_endpoint_url": "https://account.r2.cloudflarestorage.com",
+             },
+         }
+
+     @classmethod
+     def get_connection_form_widgets(cls) -> dict[str, Any]:
+         """Define custom connection form widgets."""
+         from flask_appbuilder.fieldwidgets import BS3PasswordFieldWidget, BS3TextFieldWidget
+         from flask_babel import lazy_gettext
+         from wtforms import PasswordField, StringField
+
+         return {
+             "vault_conn_id": StringField(
+                 lazy_gettext("Vault Connection ID"),
+                 widget=BS3TextFieldWidget(),
+                 description="Airflow Vault connection ID (e.g., openbao-elaunira)",
+             ),
+             "vault_namespace": StringField(
+                 lazy_gettext("Vault Namespace"),
+                 widget=BS3TextFieldWidget(),
+                 description="OpenBao namespace (e.g., elaunira/production)",
+             ),
+             "vault_secrets_mapping": StringField(
+                 lazy_gettext("Vault Secrets (JSON)"),
+                 widget=BS3TextFieldWidget(),
+                 description="JSON mapping of config keys to secret paths",
+             ),
+             "r2_access_key_id": StringField(
+                 lazy_gettext("R2 Access Key ID"),
+                 widget=BS3TextFieldWidget(),
+                 description="Direct mode: Cloudflare R2 access key ID",
+             ),
+             "r2_secret_access_key": PasswordField(
+                 lazy_gettext("R2 Secret Access Key"),
+                 widget=BS3PasswordFieldWidget(),
+                 description="Direct mode: Cloudflare R2 secret access key",
+             ),
+             "r2_endpoint_url": StringField(
+                 lazy_gettext("R2 Endpoint URL"),
+                 widget=BS3TextFieldWidget(),
+                 description="Direct mode: Cloudflare R2 endpoint URL",
+             ),
+         }
+
+     def __init__(self, r2index_conn_id: str = default_conn_name) -> None:
+         super().__init__()
+         self.r2index_conn_id = r2index_conn_id
+         self._client: R2IndexClient | None = None
+
+     def _parse_secret_ref(self, secret_ref: str, default_key: str) -> tuple[str, str]:
+         """Parse a secret reference into (path, key).
+
+         Format: "path#key" or just "path" (uses default_key).
+         """
+         if "#" in secret_ref:
+             path, key = secret_ref.rsplit("#", 1)
+             return path, key
+         return secret_ref, default_key
+
+     def _get_config_from_env(self) -> dict[str, str | None]:
+         """Get configuration from environment variables."""
+         return {
+             "index_api_url": os.environ.get("R2INDEX_API_URL"),
+             "index_api_token": os.environ.get("R2INDEX_API_TOKEN"),
+             "r2_access_key_id": os.environ.get("R2_ACCESS_KEY_ID"),
+             "r2_secret_access_key": os.environ.get("R2_SECRET_ACCESS_KEY"),
+             "r2_endpoint_url": os.environ.get("R2_ENDPOINT_URL"),
+         }
+
+     def _get_config_from_vault(
+         self,
+         vault_conn_id: str,
+         secrets: dict[str, str],
+         namespace: str | None = None,
+     ) -> dict[str, str | None] | None:
+         """Get configuration from Vault/OpenBao using Airflow's VaultHook.
+
+         Args:
+             vault_conn_id: Airflow connection ID for Vault/OpenBao
+             secrets: Mapping of config key to secret reference (path#key format)
+             namespace: OpenBao namespace to use
+         """
+         from airflow.providers.hashicorp.hooks.vault import VaultHook
+
+         try:
+             # Note: namespace should be configured in the Vault connection's extra field
+             vault_hook = VaultHook(vault_conn_id=vault_conn_id)
+             secret_cache: dict[str, dict[str, Any]] = {}
+
+             def get_secret_value(config_key: str) -> str | None:
+                 secret_ref = secrets.get(config_key)
+                 if not secret_ref:
+                     return None
+
+                 path, key = self._parse_secret_ref(secret_ref, config_key)
+
+                 if path not in secret_cache:
+                     data = vault_hook.get_secret(secret_path=path, secret_version=None)
+                     secret_cache[path] = data or {}
+
+                 return secret_cache[path].get(key)
+
+             config = {
+                 "index_api_url": get_secret_value("r2index_api_url"),
+                 "index_api_token": get_secret_value("r2index_api_token"),
+                 "r2_access_key_id": get_secret_value("r2_access_key_id"),
+                 "r2_secret_access_key": get_secret_value("r2_secret_access_key"),
+                 "r2_endpoint_url": get_secret_value("r2_endpoint_url"),
+             }
+             # Log which values are missing
+             missing = [k for k, v in config.items() if v is None]
+             if missing:
+                 self.log.warning("Missing Vault secrets: %s", missing)
+             return config
+         except Exception as e:
+             self.log.error("Failed to get config from Vault: %s", e)
+             return None
+
+     def _get_config_from_connection(self) -> dict[str, str | None] | None:
+         """Get configuration from Airflow connection.
+
+         If connection has vault_conn_id, fetches from Vault/OpenBao.
+         Otherwise uses direct credentials from connection fields.
+         """
+         try:
+             conn = self.get_connection(self.r2index_conn_id)
+             extra = conn.extra_dejson
+
+             vault_conn_id = extra.get("vault_conn_id")
+             if vault_conn_id:
+                 secrets_raw = extra.get("vault_secrets_mapping")
+                 if not secrets_raw:
+                     return None
+                 if isinstance(secrets_raw, str):
+                     secrets = json.loads(secrets_raw)
+                 else:
+                     secrets = secrets_raw
+                 return self._get_config_from_vault(
+                     vault_conn_id=vault_conn_id,
+                     secrets=secrets,
+                     namespace=extra.get("vault_namespace"),
+                 )
+
+             return {
+                 "index_api_url": conn.host,
+                 "index_api_token": conn.password,
+                 "r2_access_key_id": extra.get("r2_access_key_id"),
+                 "r2_secret_access_key": extra.get("r2_secret_access_key"),
+                 "r2_endpoint_url": extra.get("r2_endpoint_url"),
+             }
+         except Exception as e:
+             self.log.error("Failed to get config from connection: %s", e)
+             return None
+
+     def get_conn(self) -> R2IndexClient:
+         """Get the R2IndexClient."""
+         if self._client is not None:
+             return self._client
+
+         from elaunira.r2index import R2IndexClient
+
+         config = self._get_config_from_connection()
+         if config is None or not config.get("index_api_url"):
+             config = self._get_config_from_env()
+
+         self._client = R2IndexClient(
+             index_api_url=config["index_api_url"],
+             index_api_token=config["index_api_token"],
+             r2_access_key_id=config["r2_access_key_id"],
+             r2_secret_access_key=config["r2_secret_access_key"],
+             r2_endpoint_url=config["r2_endpoint_url"],
+         )
+         return self._client
+
+     def upload(
+         self,
+         bucket: str,
+         source: str,
+         category: str,
+         entity: str,
+         extension: str,
+         media_type: str,
+         destination_path: str,
+         destination_filename: str,
+         destination_version: str,
+         name: str | None = None,
+         tags: list[str] | None = None,
+         extra: dict[str, Any] | None = None,
+         create_checksum_files: bool = False,
+     ) -> dict[str, Any]:
+         """Upload a file to R2 and register it with R2Index."""
+         client = self.get_conn()
+         file_record = client.upload(
+             bucket=bucket,
+             source=source,
+             category=category,
+             entity=entity,
+             extension=extension,
+             media_type=media_type,
+             destination_path=destination_path,
+             destination_filename=destination_filename,
+             destination_version=destination_version,
+             name=name,
+             tags=tags,
+             extra=extra,
+             create_checksum_files=create_checksum_files,
+         )
+         return file_record.model_dump()
+
+     def download(
+         self,
+         bucket: str,
+         source_path: str,
+         source_filename: str,
+         source_version: str,
+         destination: str,
+         verify_checksum: bool = True,
+     ) -> dict[str, Any]:
+         """Download a file from R2."""
+         client = self.get_conn()
+         downloaded_path, file_record = client.download(
+             bucket=bucket,
+             source_path=source_path,
+             source_filename=source_filename,
+             source_version=source_version,
+             destination=destination,
+             verify_checksum=verify_checksum,
+         )
+         return {
+             "path": str(downloaded_path),
+             "file_record": file_record.model_dump(),
+         }
+
+     def get_file(self, file_id: str) -> dict[str, Any]:
+         """Get a file record by ID."""
+         client = self.get_conn()
+         return client.get(file_id).model_dump()
+
+     def list_files(
+         self,
+         bucket: str | None = None,
+         category: str | None = None,
+         entity: str | None = None,
+         extension: str | None = None,
+         tags: list[str] | None = None,
+         limit: int | None = None,
+     ) -> dict[str, Any]:
+         """List files with optional filters."""
+         client = self.get_conn()
+         response = client.list_files(
+             bucket=bucket,
+             category=category,
+             entity=entity,
+             extension=extension,
+             tags=tags,
+             limit=limit,
+         )
+         return response.model_dump()
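
A hypothetical sketch of registering a Vault-mode connection programmatically, mirroring the extra-JSON layout documented in the hook docstring (paths and IDs are illustrative):

```python
import json

from airflow.models import Connection

conn = Connection(
    conn_id="r2index_default",
    conn_type="r2index",
    extra=json.dumps(
        {
            "vault_conn_id": "openbao_default",
            "vault_namespace": "elaunira/production",
            "vault_secrets_mapping": {
                "r2index_api_url": "cloudflare/r2index#api-url",
                "r2index_api_token": "cloudflare/r2index#api-token",
                "r2_access_key_id": "cloudflare/r2/airflow#access-key-id",
                "r2_secret_access_key": "cloudflare/r2/airflow#secret-access-key",
                "r2_endpoint_url": "cloudflare/r2/airflow#endpoint-url",
            },
        }
    ),
)
# The connection can then be created via the Airflow UI, API, or a secrets
# backend; conn.get_uri() gives an equivalent URI representation.
```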
@@ -0,0 +1,5 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/links/__init__.py
+ """R2Index operator extra links."""
+
+ from elaunira.airflow.providers.r2index.links.r2index import R2IndexFileLink
+
+ __all__ = ["R2IndexFileLink"]
@@ -0,0 +1,37 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/links/r2index.py
+ """R2Index operator extra links."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
+
+ from airflow.models import BaseOperatorLink
+
+ if TYPE_CHECKING:
+     from airflow.models.taskinstancekey import TaskInstanceKey
+
+
+ class R2IndexFileLink(BaseOperatorLink):
+     """
+     Link to the R2Index file details.
+
+     This link extracts the file ID from the operator's XCom return value
+     and constructs a URL to view the file in the R2Index UI.
+     """
+
+     name = "R2Index File"
+
+     def get_link(
+         self,
+         operator: Any,
+         *,
+         ti_key: TaskInstanceKey,
+     ) -> str:
+         """Get the link to the R2Index file."""
+         from airflow.models import XCom
+
+         result = XCom.get_value(ti_key=ti_key, key="return_value")
+         if result and isinstance(result, dict):
+             file_id = result.get("id") or result.get("file_record", {}).get("id")
+             if file_id:
+                 return f"https://r2index.elaunira.com/files/{file_id}"
+         return ""
@@ -0,0 +1,15 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/operators/__init__.py
+ """R2Index operators for Airflow."""
+
+ from elaunira.airflow.providers.r2index.operators.r2index import (
+     DownloadItem,
+     R2IndexDownloadOperator,
+     R2IndexUploadOperator,
+     UploadItem,
+ )
+
+ __all__ = [
+     "DownloadItem",
+     "R2IndexDownloadOperator",
+     "R2IndexUploadOperator",
+     "UploadItem",
+ ]
@@ -0,0 +1,232 @@ elaunira_airflow_providers_r2index-0.4.0/src/elaunira/airflow/providers/r2index/operators/r2index.py
+ """R2Index operators for Airflow."""
+
+ from __future__ import annotations
+
+ import asyncio
+ from collections.abc import Sequence
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any
+
+ from airflow.exceptions import AirflowException
+ from airflow.models import BaseOperator
+
+ from elaunira.airflow.providers.r2index.hooks import R2IndexHook
+ from elaunira.airflow.providers.r2index.links.r2index import R2IndexFileLink
+ from elaunira.r2index import AsyncR2IndexClient
+
+ if TYPE_CHECKING:
+     from airflow.utils.context import Context
+
+
+ @dataclass
+ class UploadItem:
+     """Defines a single file to upload."""
+
+     source: str
+     category: str
+     entity: str
+     extension: str
+     media_type: str
+     destination_path: str
+     destination_filename: str
+     destination_version: str
+     bucket: str | None = None
+     name: str | None = None
+     tags: list[str] | None = None
+     extra: dict[str, Any] | None = None
+     create_checksum_files: bool = False
+     r2index_conn_id: str | None = None
+
+
+ @dataclass
+ class DownloadItem:
+     """Defines a single file to download."""
+
+     source_path: str
+     source_filename: str
+     source_version: str
+     destination: str
+     bucket: str | None = None
+     verify_checksum: bool = True
+     r2index_conn_id: str | None = None
+
+
+ class R2IndexUploadOperator(BaseOperator):
+     """
+     Upload one or more files to R2 in parallel.
+
+     When multiple items are provided, all uploads run concurrently using
+     asyncio within the worker process.
+
+     :param bucket: Default R2 bucket name (can be overridden per item).
+     :param items: List of UploadItem or single UploadItem defining files to upload.
+     :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
+     """
+
+     template_fields: Sequence[str] = ("items",)
+     template_ext: Sequence[str] = ()
+     ui_color = "#e4f0e8"
+     operator_extra_links = (R2IndexFileLink(),)
+
+     def __init__(
+         self,
+         *,
+         bucket: str,
+         items: list[UploadItem] | UploadItem,
+         r2index_conn_id: str = "r2index_default",
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(**kwargs)
+         self.bucket = bucket
+         self.items = [items] if isinstance(items, UploadItem) else items
+         self.r2index_conn_id = r2index_conn_id
+
+     def _get_client_config(self, conn_id: str) -> dict[str, Any]:
+         """Get client configuration using the hook's priority chain."""
+         hook = R2IndexHook(r2index_conn_id=conn_id)
+         config = hook._get_config_from_connection()
+         if config is None or not config.get("index_api_url"):
+             config = hook._get_config_from_env()
+         return config or {}
+
+     def execute(self, context: Context) -> list[dict[str, Any]]:
+         """Execute the uploads in parallel."""
+         # Group items by connection ID for efficient client reuse
+         conn_configs: dict[str, dict[str, Any]] = {}
+
+         async def get_or_create_config(conn_id: str) -> dict[str, Any]:
+             if conn_id not in conn_configs:
+                 conn_configs[conn_id] = self._get_client_config(conn_id)
+             return conn_configs[conn_id]
+
+         async def upload_one(item: UploadItem) -> dict[str, Any]:
+             conn_id = item.r2index_conn_id or self.r2index_conn_id
+             bucket = item.bucket or self.bucket
+             config = await get_or_create_config(conn_id)
+
+             try:
+                 async with AsyncR2IndexClient(**config) as client:
+                     file_record = await client.upload(
+                         bucket=bucket,
+                         source=item.source,
+                         category=item.category,
+                         entity=item.entity,
+                         extension=item.extension,
+                         media_type=item.media_type,
+                         destination_path=item.destination_path,
+                         destination_filename=item.destination_filename,
+                         destination_version=item.destination_version,
+                         name=item.name,
+                         tags=item.tags,
+                         extra=item.extra,
+                         create_checksum_files=item.create_checksum_files,
+                     )
+                     return {"status": "success", "file_record": file_record.model_dump()}
+             except Exception as e:
+                 return {"status": "error", "message": str(e), "source": item.source}
+
+         async def upload_all() -> list[dict[str, Any]]:
+             tasks = [upload_one(item) for item in self.items]
+             return await asyncio.gather(*tasks)
+
+         results = asyncio.run(upload_all())
+
+         # Check for errors
+         errors = [r for r in results if r.get("status") == "error"]
+         if errors:
+             self.log.error("Failed uploads: %s", errors)
+             raise AirflowException(f"{len(errors)}/{len(results)} uploads failed")
+
+         self.log.info("Uploaded %d files in parallel", len(results))
+         return [r["file_record"] for r in results]
+
+
+ class R2IndexDownloadOperator(BaseOperator):
+     """
+     Download one or more files from R2 in parallel.
+
+     When multiple items are provided, all downloads run concurrently using
+     asyncio within the worker process.
+
+     :param bucket: Default R2 bucket name (can be overridden per item).
+     :param items: List of DownloadItem or single DownloadItem defining files to download.
+     :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
+     """
+
+     template_fields: Sequence[str] = ("items",)
+     template_ext: Sequence[str] = ()
+     ui_color = "#f0e4e8"
+     operator_extra_links = (R2IndexFileLink(),)
+
+     def __init__(
+         self,
+         *,
+         bucket: str,
+         items: list[DownloadItem] | DownloadItem,
+         r2index_conn_id: str = "r2index_default",
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(**kwargs)
+         self.bucket = bucket
+         self.items = [items] if isinstance(items, DownloadItem) else items
+         self.r2index_conn_id = r2index_conn_id
+
+     def _get_client_config(self, conn_id: str) -> dict[str, Any]:
+         """Get client configuration using the hook's priority chain."""
+         hook = R2IndexHook(r2index_conn_id=conn_id)
+         config = hook._get_config_from_connection()
+         if config is None or not config.get("index_api_url"):
+             config = hook._get_config_from_env()
+         return config or {}
+
+     def execute(self, context: Context) -> list[dict[str, Any]]:
+         """Execute the downloads in parallel."""
+         # Group items by connection ID for efficient client reuse
+         conn_configs: dict[str, dict[str, Any]] = {}
+
+         async def get_or_create_config(conn_id: str) -> dict[str, Any]:
+             if conn_id not in conn_configs:
+                 conn_configs[conn_id] = self._get_client_config(conn_id)
+             return conn_configs[conn_id]
+
+         async def download_one(item: DownloadItem) -> dict[str, Any]:
+             conn_id = item.r2index_conn_id or self.r2index_conn_id
+             bucket = item.bucket or self.bucket
+             config = await get_or_create_config(conn_id)
+
+             try:
+                 async with AsyncR2IndexClient(**config) as client:
+                     downloaded_path, file_record = await client.download(
+                         bucket=bucket,
+                         source_path=item.source_path,
+                         source_filename=item.source_filename,
+                         source_version=item.source_version,
+                         destination=item.destination,
+                         verify_checksum=item.verify_checksum,
+                     )
+                     return {
+                         "status": "success",
+                         "path": str(downloaded_path),
+                         "file_record": file_record.model_dump(),
+                     }
+             except Exception as e:
+                 return {
+                     "status": "error",
+                     "message": str(e),
+                     "source_path": item.source_path,
+                 }
+
+         async def download_all() -> list[dict[str, Any]]:
+             tasks = [download_one(item) for item in self.items]
+             return await asyncio.gather(*tasks)
+
+         results = asyncio.run(download_all())
+
+         # Check for errors
+         errors = [r for r in results if r.get("status") == "error"]
+         if errors:
+             self.log.error("Failed downloads: %s", errors)
+             raise AirflowException(f"{len(errors)}/{len(results)} downloads failed")
+
+         self.log.info("Downloaded %d files in parallel", len(results))
+         return [{"path": r["path"], "file_record": r["file_record"]} for r in results]