elaunira-airflow 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- elaunira_airflow-0.1.0/.gitignore +1 -0
- elaunira_airflow-0.1.0/PKG-INFO +14 -0
- elaunira_airflow-0.1.0/README.md +273 -0
- elaunira_airflow-0.1.0/pyproject.toml +29 -0
- elaunira_airflow-0.1.0/src/elaunira/__init__.py +1 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/__init__.py +5 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/decorators/__init__.py +5 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/decorators/r2index.py +139 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/hooks/__init__.py +5 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/links/__init__.py +5 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/links/r2index.py +37 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/operators/__init__.py +15 -0
- elaunira_airflow-0.1.0/src/elaunira/airflow/operators/r2index.py +232 -0

elaunira_airflow-0.1.0/.gitignore
@@ -0,0 +1 @@
*.iml

elaunira_airflow-0.1.0/PKG-INFO
@@ -0,0 +1,14 @@
Metadata-Version: 2.4
Name: elaunira-airflow
Version: 0.1.0
Summary: Elaunira Airflow operators and hooks for R2Index
Project-URL: Repository, https://github.com/elaunira/elaunira-airflow
License-Expression: MIT
Requires-Python: >=3.12
Requires-Dist: apache-airflow-providers-edge3
Requires-Dist: apache-airflow-providers-git
Requires-Dist: apache-airflow>=3.0.0
Requires-Dist: cryptography
Requires-Dist: elaunira-airflow-provider-r2index
Requires-Dist: elaunira-r2index
Requires-Dist: pyjwt

elaunira_airflow-0.1.0/README.md
@@ -0,0 +1,273 @@
# elaunira-airflow

Airflow operators and hooks for R2Index file operations.

## Installation

```bash
pip install elaunira-airflow

# With OpenBao/Vault support
pip install elaunira-airflow[openbao]
```

## Configuration

Configuration is loaded from Airflow connections, with environment variables as a fallback.

### Option 1: Vault/OpenBao (recommended)

Create one Vault connection per service, then reference it from R2Index connections.

**Step 1: Set up OpenBao AppRoles**

Run the setup script to create an `airflow` AppRole in each service namespace:

```bash
./scripts/setup-openbao-approle.sh
```

In each service namespace (ipregistry, noticeable, etc.), this creates:
- AppRole: `airflow`
- Policy: `airflow` (read access to `kv/data/*` in the namespace and its sub-namespaces)

To retrieve credentials later:

```bash
# Get role_id (can be read anytime)
bao read -namespace="ipregistry" -field=role_id auth/approle/role/airflow/role-id

# Generate a new secret_id (an existing one cannot be retrieved)
bao write -namespace="ipregistry" -field=secret_id -f auth/approle/role/airflow/secret-id
```

**Step 2: Create a Vault connection (one per service)**

| Field | Value |
|-------|-------|
| Connection ID | `openbao-ipregistry` |
| Connection Type | `HashiCorp Vault` |
| Host | `bao.elaunira.com` |
| Schema | `https` |
| Login | AppRole role_id |
| Password | AppRole secret_id |
| Extra | `{"auth_type": "approle", "auth_mount_point": "approle", "namespace": "ipregistry"}` |
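
For scripted setups, the same connection can also be expressed as an `airflow.models.Connection` object. A minimal sketch with placeholder credentials; the `"vault"` conn_type string registered by the HashiCorp provider is an assumption here:

```python
# Sketch only: the Step 2 connection built in code, e.g. for a test harness or
# provisioning script. role_id/secret_id are placeholders, and "vault" is
# assumed to be the conn_type registered by the HashiCorp Vault provider.
import json

from airflow.models.connection import Connection

openbao_ipregistry = Connection(
    conn_id="openbao-ipregistry",
    conn_type="vault",
    host="bao.elaunira.com",
    schema="https",
    login="<approle-role-id>",       # AppRole role_id
    password="<approle-secret-id>",  # AppRole secret_id
    extra=json.dumps(
        {
            "auth_type": "approle",
            "auth_mount_point": "approle",
            "namespace": "ipregistry",
        }
    ),
)
```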

**Step 3: Create R2Index connection(s)**

| Field | Value |
|-------|-------|
| Connection ID | `r2index-ipregistry-prod` |
| Connection Type | `Generic` |
| Extra | JSON (see below) |

```json
{
  "vault_conn_id": "openbao-ipregistry",
  "vault_namespace": "ipregistry/production",
  "vault_secrets": {
    "r2index_api_url": "cloudflare/r2index#api-url",
    "r2index_api_token": "cloudflare/r2index#api-token",
    "r2_access_key_id": "cloudflare/r2/airflow#access-key-id",
    "r2_secret_access_key": "cloudflare/r2/airflow#secret-access-key",
    "r2_endpoint_url": "cloudflare/r2/airflow#endpoint-url"
  }
}
```

- `vault_conn_id`: references the Vault connection for the service
- `vault_namespace`: target namespace (e.g., `ipregistry/production`)
- `vault_secrets`: mapping of config key to secret path
  - Format: `"path#key"` or `"path"` (uses the config key as the secret key)
  - Secrets at the same path are fetched once and cached
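
As an illustration of the `"path#key"` format and the per-path caching described above, resolution can be sketched like this (an illustration of the documented semantics, not the package's actual internals):

```python
# Sketch: resolve "path#key" / "path" specs against a Vault-style KV reader,
# fetching each distinct path only once.
from collections.abc import Callable


def resolve_vault_secrets(
    vault_secrets: dict[str, str],
    read_path: Callable[[str], dict[str, str]],  # e.g. a KV v2 read at a path
) -> dict[str, str]:
    cache: dict[str, dict[str, str]] = {}
    resolved: dict[str, str] = {}
    for config_key, spec in vault_secrets.items():
        path, _, secret_key = spec.partition("#")
        if path not in cache:  # secrets at the same path are fetched once
            cache[path] = read_path(path)
        # Bare "path" (no "#key") falls back to the config key as the secret key
        resolved[config_key] = cache[path][secret_key or config_key]
    return resolved
```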

Required keys in `vault_secrets`:
- `r2index_api_url`
- `r2index_api_token`
- `r2_access_key_id`
- `r2_secret_access_key`
- `r2_endpoint_url`

### Option 2: Direct Credentials

Store credentials directly in the Airflow connection:

| Field | Value |
|-------|-------|
| Connection ID | `r2index_default` |
| Host | R2Index API URL |
| Password | R2Index API token |
| Extra | JSON (see below) |

```json
{
  "r2_access_key_id": "your-access-key",
  "r2_secret_access_key": "your-secret-key",
  "r2_endpoint_url": "https://your-account.r2.cloudflarestorage.com"
}
```

### Option 3: Environment Variables (fallback)

If no connection is found, the following environment variables are used:

```bash
export R2INDEX_API_URL="https://r2index.elaunira.com"
export R2INDEX_API_TOKEN="your-token"
export R2_ACCESS_KEY_ID="your-access-key"
export R2_SECRET_ACCESS_KEY="your-secret-key"
export R2_ENDPOINT_URL="https://your-account.r2.cloudflarestorage.com"
```
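
The resolution order across these options can be pictured with a small sketch. It is illustrative only: the internal config key `index_api_url` appears in the operator source further down, while the other key names are assumptions:

```python
# Illustrative fallback chain: prefer the connection's config when it yields an
# API URL, otherwise read the environment variables listed above.
import os

ENV_VARS = {
    "index_api_url": "R2INDEX_API_URL",
    "index_api_token": "R2INDEX_API_TOKEN",
    "access_key_id": "R2_ACCESS_KEY_ID",
    "secret_access_key": "R2_SECRET_ACCESS_KEY",
    "endpoint_url": "R2_ENDPOINT_URL",
}


def resolve_config(connection_config: dict[str, str] | None) -> dict[str, str]:
    if connection_config and connection_config.get("index_api_url"):
        return connection_config
    # No usable connection: fall back to whatever the environment provides
    return {key: os.environ[var] for key, var in ENV_VARS.items() if var in os.environ}
```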

## Usage

### Upload files

```python
from elaunira.airflow.operators import R2IndexUploadOperator, UploadItem

# Single file
upload = R2IndexUploadOperator(
    task_id="upload_file",
    bucket="my-bucket",
    items=UploadItem(
        source="/tmp/data.csv",
        category="acme",
        entity="acme-data",
        extension="csv",
        media_type="text/csv",
        destination_path="acme/data",
        destination_filename="data.csv",
        destination_version="{{ ds }}",
    ),
)

# Multiple files (parallel)
upload_batch = R2IndexUploadOperator(
    task_id="upload_batch",
    bucket="my-bucket",
    items=[
        UploadItem(
            source="/tmp/file1.csv",
            category="acme",
            entity="acme-data",
            extension="csv",
            media_type="text/csv",
            destination_path="acme/data",
            destination_filename="file1.csv",
            destination_version="{{ ds }}",
        ),
        UploadItem(
            source="/tmp/file2.csv",
            category="acme",
            entity="acme-data",
            extension="csv",
            media_type="text/csv",
            destination_path="acme/data",
            destination_filename="file2.csv",
            destination_version="{{ ds }}",
        ),
    ],
)
```

### Download files

```python
from elaunira.airflow.operators import R2IndexDownloadOperator, DownloadItem

# Single file
download = R2IndexDownloadOperator(
    task_id="download_file",
    bucket="my-bucket",
    items=DownloadItem(
        source_path="acme/data",
        source_filename="data.csv",
        source_version="{{ ds }}",
        destination="/tmp/downloaded.csv",
    ),
)

# Multiple files (parallel)
download_batch = R2IndexDownloadOperator(
    task_id="download_batch",
    bucket="my-bucket",
    items=[
        DownloadItem(
            source_path="acme/data",
            source_filename="file1.csv",
            source_version="{{ ds }}",
            destination="/tmp/file1.csv",
        ),
        DownloadItem(
            source_path="acme/data",
            source_filename="file2.csv",
            source_version="{{ ds }}",
            destination="/tmp/file2.csv",
        ),
    ],
)
```
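
Putting the two together, a minimal DAG that uploads a file and downloads it back might look like this. The dag_id, schedule, and paths are illustrative, and the classic `from airflow import DAG` entry point is assumed:

```python
# Sketch of a round-trip DAG using both operators; names and paths are made up.
import pendulum
from airflow import DAG

from elaunira.airflow.operators import (
    DownloadItem,
    R2IndexDownloadOperator,
    R2IndexUploadOperator,
    UploadItem,
)

with DAG(
    dag_id="r2index_roundtrip_example",
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    schedule="@daily",
) as dag:
    upload = R2IndexUploadOperator(
        task_id="upload",
        bucket="my-bucket",
        items=UploadItem(
            source="/tmp/data.csv",
            category="acme",
            entity="acme-data",
            extension="csv",
            media_type="text/csv",
            destination_path="acme/data",
            destination_filename="data.csv",
            destination_version="{{ ds }}",
        ),
    )

    download = R2IndexDownloadOperator(
        task_id="download",
        bucket="my-bucket",
        items=DownloadItem(
            source_path="acme/data",
            source_filename="data.csv",
            source_version="{{ ds }}",
            destination="/tmp/roundtrip.csv",
        ),
    )

    # Download only after the upload has completed
    upload >> download
```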

### Template fields

The following fields support Jinja templating:

- `items` (including the nested `source`, `destination_*`, and `source_*` fields)

Common template variables:
- `{{ ds }}` - Logical date (YYYY-MM-DD)
- `{{ ds_nodash }}` - Logical date without dashes
- `{{ data_interval_start }}` - Start of the data interval
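
For example, with `UploadItem` imported as in the Usage section and a DAG run whose logical date is 2024-01-05, templated item fields render as shown in the comments:

```python
item = UploadItem(
    source="/tmp/report.csv",
    category="acme",
    entity="acme-data",
    extension="csv",
    media_type="text/csv",
    destination_path="acme/data",
    destination_filename="report-{{ ds_nodash }}.csv",  # -> report-20240105.csv
    destination_version="{{ ds }}",                     # -> 2024-01-05
)
```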

## TaskFlow API

Use decorators for a more Pythonic approach:

```python
from airflow.decorators import task
from elaunira.airflow.operators import UploadItem, DownloadItem

@task.r2index_upload(bucket="my-bucket")
def prepare_upload() -> UploadItem:
    # Dynamic logic to prepare the upload
    return UploadItem(
        source="/tmp/data.csv",
        category="acme",
        entity="acme-data",
        extension="csv",
        media_type="text/csv",
        destination_path="acme/data",
        destination_filename="data.csv",
        destination_version="2024-01-01",
    )

@task.r2index_download(bucket="my-bucket")
def prepare_download() -> DownloadItem:
    return DownloadItem(
        source_path="acme/data",
        source_filename="data.csv",
        source_version="2024-01-01",
        destination="/tmp/downloaded.csv",
    )

# In your DAG
with DAG(...) as dag:
    upload_result = prepare_upload()
    download_result = prepare_download()
```

Return a list of items for parallel transfers:

```python
@task.r2index_upload(bucket="my-bucket")
def prepare_batch_upload() -> list[UploadItem]:
    return [
        UploadItem(source="/tmp/file1.csv", ...),
        UploadItem(source="/tmp/file2.csv", ...),
    ]
```

## Parallel execution

When multiple items are provided, all transfers run concurrently using asyncio within the worker process. This is efficient for I/O-bound operations like file transfers.
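
Stripped down, this is the pattern the operators use internally (see `operators/r2index.py` below): one coroutine per item, gathered on a single event loop. A self-contained sketch:

```python
import asyncio


async def transfer(item: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for the actual network transfer
    return item


async def transfer_all(items: list[str]) -> list[str]:
    # All transfers are awaited concurrently; results come back in input order
    return await asyncio.gather(*(transfer(i) for i in items))


results = asyncio.run(transfer_all(["file1.csv", "file2.csv"]))
```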

elaunira_airflow-0.1.0/pyproject.toml
@@ -0,0 +1,29 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "elaunira-airflow"
version = "0.1.0"
description = "Elaunira Airflow operators and hooks for R2Index"
requires-python = ">=3.12"
license = "MIT"
dependencies = [
    "apache-airflow>=3.0.0",
    "apache-airflow-providers-edge3",
    "apache-airflow-providers-git",
    "cryptography",
    "elaunira-airflow-provider-r2index",
    "elaunira-r2index",
    "PyJWT",
]

[project.urls]
Repository = "https://github.com/elaunira/elaunira-airflow"

[project.entry-points."airflow.task_decorators"]
r2index_upload = "elaunira.airflow.decorators.r2index:r2index_upload"
r2index_download = "elaunira.airflow.decorators.r2index:r2index_download"

[tool.hatch.build.targets.wheel]
packages = ["src/elaunira"]

elaunira_airflow-0.1.0/src/elaunira/__init__.py
@@ -0,0 +1 @@
"""Elaunira namespace package."""

elaunira_airflow-0.1.0/src/elaunira/airflow/decorators/r2index.py
@@ -0,0 +1,139 @@
"""R2Index TaskFlow decorators."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, TypeVar

from airflow.decorators.base import task_decorator_factory

from elaunira.airflow.operators.r2index import (
    DownloadItem,
    R2IndexDownloadOperator,
    R2IndexUploadOperator,
    UploadItem,
)

if TYPE_CHECKING:
    from airflow.decorators.base import TaskDecorator

F = TypeVar("F", bound=Callable[..., Any])


def r2index_upload(
    bucket: str,
    r2index_conn_id: str = "r2index_default",
    **kwargs: Any,
) -> TaskDecorator:
    """
    Decorator to upload files to R2Index.

    The decorated function should return an UploadItem or list of UploadItems.

    Example:
        @task.r2index_upload(bucket="my-bucket")
        def prepare_upload() -> UploadItem:
            return UploadItem(
                source="/tmp/data.csv",
                category="acme",
                entity="acme-data",
                extension="csv",
                media_type="text/csv",
                destination_path="acme/data",
                destination_filename="data.csv",
                destination_version="{{ ds }}",
            )

    :param bucket: R2 bucket name.
    :param r2index_conn_id: Airflow connection ID for R2Index.
    """
    return task_decorator_factory(
        decorated_operator_class=_R2IndexUploadDecoratedOperator,
        bucket=bucket,
        r2index_conn_id=r2index_conn_id,
        **kwargs,
    )


def r2index_download(
    bucket: str,
    r2index_conn_id: str = "r2index_default",
    **kwargs: Any,
) -> TaskDecorator:
    """
    Decorator to download files from R2Index.

    The decorated function should return a DownloadItem or list of DownloadItems.

    Example:
        @task.r2index_download(bucket="my-bucket")
        def prepare_download() -> DownloadItem:
            return DownloadItem(
                source_path="acme/data",
                source_filename="data.csv",
                source_version="{{ ds }}",
                destination="/tmp/downloaded.csv",
            )

    :param bucket: R2 bucket name.
    :param r2index_conn_id: Airflow connection ID for R2Index.
    """
    return task_decorator_factory(
        decorated_operator_class=_R2IndexDownloadDecoratedOperator,
        bucket=bucket,
        r2index_conn_id=r2index_conn_id,
        **kwargs,
    )


class _R2IndexUploadDecoratedOperator(R2IndexUploadOperator):
    """Decorated operator for R2Index uploads."""

    custom_operator_name = "@task.r2index_upload"

    def __init__(
        self,
        *,
        python_callable: Callable[..., UploadItem | list[UploadItem]],
        op_args: list[Any] | None = None,
        op_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        self.python_callable = python_callable
        self.op_args = op_args or []
        self.op_kwargs = op_kwargs or {}
        # items will be set in execute()
        kwargs["items"] = []
        super().__init__(**kwargs)

    def execute(self, context: Any) -> list[dict[str, Any]]:
        """Execute the decorated function and upload the result."""
        items = self.python_callable(*self.op_args, **self.op_kwargs)
        self.items = [items] if isinstance(items, UploadItem) else items
        return super().execute(context)


class _R2IndexDownloadDecoratedOperator(R2IndexDownloadOperator):
    """Decorated operator for R2Index downloads."""

    custom_operator_name = "@task.r2index_download"

    def __init__(
        self,
        *,
        python_callable: Callable[..., DownloadItem | list[DownloadItem]],
        op_args: list[Any] | None = None,
        op_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        self.python_callable = python_callable
        self.op_args = op_args or []
        self.op_kwargs = op_kwargs or {}
        # items will be set in execute()
        kwargs["items"] = []
        super().__init__(**kwargs)

    def execute(self, context: Any) -> list[dict[str, Any]]:
        """Execute the decorated function and download the result."""
        items = self.python_callable(*self.op_args, **self.op_kwargs)
        self.items = [items] if isinstance(items, DownloadItem) else items
        return super().execute(context)

elaunira_airflow-0.1.0/src/elaunira/airflow/links/r2index.py
@@ -0,0 +1,37 @@
"""R2Index operator extra links."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from airflow.models import BaseOperatorLink

if TYPE_CHECKING:
    from airflow.models.taskinstancekey import TaskInstanceKey


class R2IndexFileLink(BaseOperatorLink):
    """
    Link to the R2Index file details.

    This link extracts the file ID from the operator's XCom return value
    and constructs a URL to view the file in the R2Index UI.
    """

    name = "R2Index File"

    def get_link(
        self,
        operator: Any,
        *,
        ti_key: TaskInstanceKey,
    ) -> str:
        """Get the link to the R2Index file."""
        from airflow.models import XCom

        result = XCom.get_value(ti_key=ti_key)
        if result and isinstance(result, dict):
            file_id = result.get("id") or result.get("file_record", {}).get("id")
            if file_id:
                return f"https://r2index.elaunira.com/files/{file_id}"
        return ""

elaunira_airflow-0.1.0/src/elaunira/airflow/operators/__init__.py
@@ -0,0 +1,15 @@
"""Elaunira Airflow operators."""

from elaunira.airflow.operators.r2index import (
    DownloadItem,
    R2IndexDownloadOperator,
    R2IndexUploadOperator,
    UploadItem,
)

__all__ = [
    "R2IndexUploadOperator",
    "R2IndexDownloadOperator",
    "UploadItem",
    "DownloadItem",
]

elaunira_airflow-0.1.0/src/elaunira/airflow/operators/r2index.py
@@ -0,0 +1,232 @@
"""R2Index operators for Airflow."""

from __future__ import annotations

import asyncio
from collections.abc import Sequence
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from airflow.exceptions import AirflowException
from airflow.models import BaseOperator

from elaunira.airflow.hooks.r2index import R2IndexHook
from elaunira.airflow.links.r2index import R2IndexFileLink
from elaunira.r2index import AsyncR2IndexClient

if TYPE_CHECKING:
    from airflow.utils.context import Context


@dataclass
class UploadItem:
    """Defines a single file to upload."""

    source: str
    category: str
    entity: str
    extension: str
    media_type: str
    destination_path: str
    destination_filename: str
    destination_version: str
    bucket: str | None = None
    name: str | None = None
    tags: list[str] | None = None
    extra: dict[str, Any] | None = None
    create_checksum_files: bool = False
    r2index_conn_id: str | None = None


@dataclass
class DownloadItem:
    """Defines a single file to download."""

    source_path: str
    source_filename: str
    source_version: str
    destination: str
    bucket: str | None = None
    verify_checksum: bool = True
    r2index_conn_id: str | None = None


class R2IndexUploadOperator(BaseOperator):
    """
    Upload one or more files to R2 in parallel.

    When multiple items are provided, all uploads run concurrently using
    asyncio within the worker process.

    :param bucket: Default R2 bucket name (can be overridden per item).
    :param items: List of UploadItem or single UploadItem defining files to upload.
    :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
    """

    template_fields: Sequence[str] = ("items",)
    template_ext: Sequence[str] = ()
    ui_color = "#e4f0e8"
    operator_extra_links = (R2IndexFileLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        items: list[UploadItem] | UploadItem,
        r2index_conn_id: str = "r2index_default",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        self.items = [items] if isinstance(items, UploadItem) else items
        self.r2index_conn_id = r2index_conn_id

    def _get_client_config(self, conn_id: str) -> dict[str, Any]:
        """Get client configuration using the hook's priority chain."""
        hook = R2IndexHook(r2index_conn_id=conn_id)
        config = hook._get_config_from_connection()
        if config is None or not config.get("index_api_url"):
            config = hook._get_config_from_env()
        return config or {}

    def execute(self, context: Context) -> list[dict[str, Any]]:
        """Execute the uploads in parallel."""
        # Group items by connection ID for efficient client reuse
        conn_configs: dict[str, dict[str, Any]] = {}

        async def get_or_create_config(conn_id: str) -> dict[str, Any]:
            if conn_id not in conn_configs:
                conn_configs[conn_id] = self._get_client_config(conn_id)
            return conn_configs[conn_id]

        async def upload_one(item: UploadItem) -> dict[str, Any]:
            conn_id = item.r2index_conn_id or self.r2index_conn_id
            bucket = item.bucket or self.bucket
            config = await get_or_create_config(conn_id)

            try:
                async with AsyncR2IndexClient(**config) as client:
                    file_record = await client.upload(
                        bucket=bucket,
                        source=item.source,
                        category=item.category,
                        entity=item.entity,
                        extension=item.extension,
                        media_type=item.media_type,
                        destination_path=item.destination_path,
                        destination_filename=item.destination_filename,
                        destination_version=item.destination_version,
                        name=item.name,
                        tags=item.tags,
                        extra=item.extra,
                        create_checksum_files=item.create_checksum_files,
                    )
                    return {"status": "success", "file_record": file_record.model_dump()}
            except Exception as e:
                return {"status": "error", "message": str(e), "source": item.source}

        async def upload_all() -> list[dict[str, Any]]:
            tasks = [upload_one(item) for item in self.items]
            return await asyncio.gather(*tasks)

        results = asyncio.run(upload_all())

        # Check for errors
        errors = [r for r in results if r.get("status") == "error"]
        if errors:
            self.log.error("Failed uploads: %s", errors)
            raise AirflowException(f"{len(errors)}/{len(results)} uploads failed")

        self.log.info("Uploaded %d files in parallel", len(results))
        return [r["file_record"] for r in results]


class R2IndexDownloadOperator(BaseOperator):
    """
    Download one or more files from R2 in parallel.

    When multiple items are provided, all downloads run concurrently using
    asyncio within the worker process.

    :param bucket: Default R2 bucket name (can be overridden per item).
    :param items: List of DownloadItem or single DownloadItem defining files to download.
    :param r2index_conn_id: Default Airflow connection ID (can be overridden per item).
    """

    template_fields: Sequence[str] = ("items",)
    template_ext: Sequence[str] = ()
    ui_color = "#f0e4e8"
    operator_extra_links = (R2IndexFileLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        items: list[DownloadItem] | DownloadItem,
        r2index_conn_id: str = "r2index_default",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        self.items = [items] if isinstance(items, DownloadItem) else items
        self.r2index_conn_id = r2index_conn_id

    def _get_client_config(self, conn_id: str) -> dict[str, Any]:
        """Get client configuration using the hook's priority chain."""
        hook = R2IndexHook(r2index_conn_id=conn_id)
        config = hook._get_config_from_connection()
        if config is None or not config.get("index_api_url"):
            config = hook._get_config_from_env()
        return config or {}

    def execute(self, context: Context) -> list[dict[str, Any]]:
        """Execute the downloads in parallel."""
        # Group items by connection ID for efficient client reuse
        conn_configs: dict[str, dict[str, Any]] = {}

        async def get_or_create_config(conn_id: str) -> dict[str, Any]:
            if conn_id not in conn_configs:
                conn_configs[conn_id] = self._get_client_config(conn_id)
            return conn_configs[conn_id]

        async def download_one(item: DownloadItem) -> dict[str, Any]:
            conn_id = item.r2index_conn_id or self.r2index_conn_id
            bucket = item.bucket or self.bucket
            config = await get_or_create_config(conn_id)

            try:
                async with AsyncR2IndexClient(**config) as client:
                    downloaded_path, file_record = await client.download(
                        bucket=bucket,
                        source_path=item.source_path,
                        source_filename=item.source_filename,
                        source_version=item.source_version,
                        destination=item.destination,
                        verify_checksum=item.verify_checksum,
                    )
                    return {
                        "status": "success",
                        "path": str(downloaded_path),
                        "file_record": file_record.model_dump(),
                    }
            except Exception as e:
                return {
                    "status": "error",
                    "message": str(e),
                    "source_path": item.source_path,
                }

        async def download_all() -> list[dict[str, Any]]:
            tasks = [download_one(item) for item in self.items]
            return await asyncio.gather(*tasks)

        results = asyncio.run(download_all())

        # Check for errors
        errors = [r for r in results if r.get("status") == "error"]
        if errors:
            self.log.error("Failed downloads: %s", errors)
            raise AirflowException(f"{len(errors)}/{len(results)} downloads failed")

        self.log.info("Downloaded %d files in parallel", len(results))
        return [{"path": r["path"], "file_record": r["file_record"]} for r in results]