apache-airflow-providers-edge3 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/edge3/LICENSE +201 -0
- airflow/providers/edge3/__init__.py +39 -0
- airflow/providers/edge3/cli/__init__.py +16 -0
- airflow/providers/edge3/cli/api_client.py +206 -0
- airflow/providers/edge3/cli/dataclasses.py +95 -0
- airflow/providers/edge3/cli/edge_command.py +689 -0
- airflow/providers/edge3/example_dags/__init__.py +16 -0
- airflow/providers/edge3/example_dags/integration_test.py +164 -0
- airflow/providers/edge3/example_dags/win_notepad.py +83 -0
- airflow/providers/edge3/example_dags/win_test.py +342 -0
- airflow/providers/edge3/executors/__init__.py +22 -0
- airflow/providers/edge3/executors/edge_executor.py +367 -0
- airflow/providers/edge3/get_provider_info.py +99 -0
- airflow/providers/edge3/models/__init__.py +16 -0
- airflow/providers/edge3/models/edge_job.py +94 -0
- airflow/providers/edge3/models/edge_logs.py +73 -0
- airflow/providers/edge3/models/edge_worker.py +230 -0
- airflow/providers/edge3/openapi/__init__.py +19 -0
- airflow/providers/edge3/openapi/edge_worker_api_v1.yaml +808 -0
- airflow/providers/edge3/plugins/__init__.py +16 -0
- airflow/providers/edge3/plugins/edge_executor_plugin.py +229 -0
- airflow/providers/edge3/plugins/templates/edge_worker_hosts.html +175 -0
- airflow/providers/edge3/plugins/templates/edge_worker_jobs.html +69 -0
- airflow/providers/edge3/version_compat.py +36 -0
- airflow/providers/edge3/worker_api/__init__.py +17 -0
- airflow/providers/edge3/worker_api/app.py +43 -0
- airflow/providers/edge3/worker_api/auth.py +135 -0
- airflow/providers/edge3/worker_api/datamodels.py +190 -0
- airflow/providers/edge3/worker_api/routes/__init__.py +16 -0
- airflow/providers/edge3/worker_api/routes/_v2_compat.py +135 -0
- airflow/providers/edge3/worker_api/routes/_v2_routes.py +237 -0
- airflow/providers/edge3/worker_api/routes/health.py +28 -0
- airflow/providers/edge3/worker_api/routes/jobs.py +162 -0
- airflow/providers/edge3/worker_api/routes/logs.py +133 -0
- airflow/providers/edge3/worker_api/routes/worker.py +224 -0
- apache_airflow_providers_edge3-1.0.0rc1.dist-info/METADATA +117 -0
- apache_airflow_providers_edge3-1.0.0rc1.dist-info/RECORD +39 -0
- apache_airflow_providers_edge3-1.0.0rc1.dist-info/WHEEL +4 -0
- apache_airflow_providers_edge3-1.0.0rc1.dist-info/entry_points.txt +6 -0
airflow/providers/edge3/executors/edge_executor.py
@@ -0,0 +1,367 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import contextlib
from collections.abc import Sequence
from copy import deepcopy
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any, Optional

from sqlalchemy import delete, inspect, text
from sqlalchemy.exc import NoSuchTableError
from sqlalchemy.orm import Session

from airflow.cli.cli_config import GroupCommand
from airflow.configuration import conf
from airflow.executors.base_executor import BaseExecutor
from airflow.models.abstractoperator import DEFAULT_QUEUE
from airflow.models.taskinstance import TaskInstance, TaskInstanceState
from airflow.providers.edge3.cli.edge_command import EDGE_COMMANDS
from airflow.providers.edge3.models.edge_job import EdgeJobModel
from airflow.providers.edge3.models.edge_logs import EdgeLogsModel
from airflow.providers.edge3.models.edge_worker import EdgeWorkerModel, EdgeWorkerState, reset_metrics
from airflow.providers.edge3.version_compat import AIRFLOW_V_3_0_PLUS
from airflow.stats import Stats
from airflow.utils import timezone
from airflow.utils.db import DBLocks, create_global_lock
from airflow.utils.session import NEW_SESSION, provide_session

if TYPE_CHECKING:
    import argparse

    from sqlalchemy.engine.base import Engine

    from airflow.executors.base_executor import CommandType
    from airflow.models.taskinstancekey import TaskInstanceKey

    # Task tuple to send to be executed
    TaskTuple = tuple[TaskInstanceKey, CommandType, Optional[str], Optional[Any]]

PARALLELISM: int = conf.getint("core", "PARALLELISM")


class EdgeExecutor(BaseExecutor):
    """Implementation of the EdgeExecutor to distribute work to Edge Workers via HTTP."""

    def __init__(self, parallelism: int = PARALLELISM):
        super().__init__(parallelism=parallelism)
        self.last_reported_state: dict[TaskInstanceKey, TaskInstanceState] = {}

    def _check_db_schema(self, engine: Engine) -> None:
        """
        Check if an already existing table matches the newest table schema.

        Workaround till Airflow 3.0.0; afterwards it is possible to use alembic also for provider distributions.
        """
        inspector = inspect(engine)
        edge_job_columns = None
        with contextlib.suppress(NoSuchTableError):
            edge_job_columns = [column["name"] for column in inspector.get_columns("edge_job")]

        # version 0.6.0rc1 added new column concurrency_slots
        if edge_job_columns and "concurrency_slots" not in edge_job_columns:
            EdgeJobModel.metadata.drop_all(engine, tables=[EdgeJobModel.__table__])

        edge_worker_columns = None
        with contextlib.suppress(NoSuchTableError):
            edge_worker_columns = [column["name"] for column in inspector.get_columns("edge_worker")]

        # version 0.14.0pre0 added new column maintenance_comment
        if edge_worker_columns and "maintenance_comment" not in edge_worker_columns:
            with Session(engine) as session:
                query = "ALTER TABLE edge_worker ADD maintenance_comment VARCHAR(1024);"
                session.execute(text(query))
                session.commit()

    @provide_session
    def start(self, session: Session = NEW_SESSION):
        """If EdgeExecutor provider is loaded first time, ensure table exists."""
        with create_global_lock(session=session, lock=DBLocks.MIGRATIONS):
            engine = session.get_bind().engine
            self._check_db_schema(engine)
            EdgeJobModel.metadata.create_all(engine)
            EdgeLogsModel.metadata.create_all(engine)
            EdgeWorkerModel.metadata.create_all(engine)

    def _process_tasks(self, task_tuples: list[TaskTuple]) -> None:
        """
        Temporary overwrite of the _process_tasks function.

        Idea is to not change the interface of the execute_async function in BaseExecutor as it will be changed in Airflow 3.
        Edge worker needs task_instance in execute_async but BaseExecutor deletes this out of the self.queued_tasks.
        Store queued_tasks in own var to be able to access this in execute_async function.
        """
        self.edge_queued_tasks = deepcopy(self.queued_tasks)
        super()._process_tasks(task_tuples)

    @provide_session
    def execute_async(
        self,
        key: TaskInstanceKey,
        command: CommandType,
        queue: str | None = None,
        executor_config: Any | None = None,
        session: Session = NEW_SESSION,
    ) -> None:
        """Execute asynchronously. Airflow 2.10 entry point to execute a task."""
        # Use of a temporary trick to get task instance, will be changed with Airflow 3.0.0
        # code works together with _process_tasks overwrite to get task instance.
        task_instance = self.edge_queued_tasks[key][3]  # TaskInstance in fourth element
        del self.edge_queued_tasks[key]

        self.validate_airflow_tasks_run_command(command)
        session.add(
            EdgeJobModel(
                dag_id=key.dag_id,
                task_id=key.task_id,
                run_id=key.run_id,
                map_index=key.map_index,
                try_number=key.try_number,
                state=TaskInstanceState.QUEUED,
                queue=queue or DEFAULT_QUEUE,
                concurrency_slots=task_instance.pool_slots,
                command=str(command),
            )
        )

    @provide_session
    def queue_workload(
        self,
        workload: Any,  # Note actually "airflow.executors.workloads.All" but not existing in Airflow 2.10
        session: Session = NEW_SESSION,
    ) -> None:
        """Put new workload to queue. Airflow 3 entry point to execute a task."""
        from airflow.executors import workloads

        if not isinstance(workload, workloads.ExecuteTask):
            raise TypeError(f"Don't know how to queue workload of type {type(workload).__name__}")

        task_instance = workload.ti
        key = task_instance.key
        session.add(
            EdgeJobModel(
                dag_id=key.dag_id,
                task_id=key.task_id,
                run_id=key.run_id,
                map_index=key.map_index,
                try_number=key.try_number,
                state=TaskInstanceState.QUEUED,
                queue=task_instance.queue,
                concurrency_slots=task_instance.pool_slots,
                command=workload.model_dump_json(),
            )
        )

    def _check_worker_liveness(self, session: Session) -> bool:
        """Reset worker state if heartbeat timed out."""
        changed = False
        heartbeat_interval: int = conf.getint("edge", "heartbeat_interval")
        lifeless_workers: list[EdgeWorkerModel] = (
            session.query(EdgeWorkerModel)
            .with_for_update(skip_locked=True)
            .filter(
                EdgeWorkerModel.state.not_in(
                    [EdgeWorkerState.UNKNOWN, EdgeWorkerState.OFFLINE, EdgeWorkerState.OFFLINE_MAINTENANCE]
                ),
                EdgeWorkerModel.last_update < (timezone.utcnow() - timedelta(seconds=heartbeat_interval * 5)),
            )
            .all()
        )

        for worker in lifeless_workers:
            changed = True
            worker.state = EdgeWorkerState.UNKNOWN
            reset_metrics(worker.worker_name)

        return changed

    def _update_orphaned_jobs(self, session: Session) -> bool:
        """Update status of jobs when workers die and don't update anymore."""
        if AIRFLOW_V_3_0_PLUS:
            heartbeat_interval_config_name = "task_instance_heartbeat_timeout"
        else:
            heartbeat_interval_config_name = "scheduler_zombie_task_threshold"
        heartbeat_interval: int = conf.getint("scheduler", heartbeat_interval_config_name)
        lifeless_jobs: list[EdgeJobModel] = (
            session.query(EdgeJobModel)
            .with_for_update(skip_locked=True)
            .filter(
                EdgeJobModel.state == TaskInstanceState.RUNNING,
                EdgeJobModel.last_update < (timezone.utcnow() - timedelta(seconds=heartbeat_interval)),
            )
            .all()
        )

        for job in lifeless_jobs:
            ti = TaskInstance.get_task_instance(
                dag_id=job.dag_id,
                run_id=job.run_id,
                task_id=job.task_id,
                map_index=job.map_index,
                session=session,
            )
            job.state = ti.state if ti else TaskInstanceState.REMOVED

            if job.state != TaskInstanceState.RUNNING:
                # Edge worker does not backport emitted Airflow metrics, so export some metrics
                # Export metrics as failed as these jobs will be deleted in the future
                tags = {
                    "dag_id": job.dag_id,
                    "task_id": job.task_id,
                    "queue": job.queue,
                    "state": str(TaskInstanceState.FAILED),
                }
                Stats.incr(
                    f"edge_worker.ti.finish.{job.queue}.{TaskInstanceState.FAILED}.{job.dag_id}.{job.task_id}",
                    tags=tags,
                )
                Stats.incr("edge_worker.ti.finish", tags=tags)

        return bool(lifeless_jobs)

    def _purge_jobs(self, session: Session) -> bool:
        """Clean finished jobs."""
        purged_marker = False
        job_success_purge = conf.getint("edge", "job_success_purge")
        job_fail_purge = conf.getint("edge", "job_fail_purge")
        jobs: list[EdgeJobModel] = (
            session.query(EdgeJobModel)
            .with_for_update(skip_locked=True)
            .filter(
                EdgeJobModel.state.in_(
                    [
                        TaskInstanceState.RUNNING,
                        TaskInstanceState.SUCCESS,
                        TaskInstanceState.FAILED,
                        TaskInstanceState.REMOVED,
                        TaskInstanceState.RESTARTING,
                        TaskInstanceState.UP_FOR_RETRY,
                    ]
                )
            )
            .all()
        )

        # Sync DB with executor otherwise runs out of sync in multi scheduler deployment
        already_removed = self.running - set(job.key for job in jobs)
        self.running = self.running - already_removed

        for job in jobs:
            if job.key in self.running:
                if job.state == TaskInstanceState.RUNNING:
                    if (
                        job.key not in self.last_reported_state
                        or self.last_reported_state[job.key] != job.state
                    ):
                        self.running_state(job.key)
                        self.last_reported_state[job.key] = job.state
                elif job.state == TaskInstanceState.SUCCESS:
                    if job.key in self.last_reported_state:
                        del self.last_reported_state[job.key]
                    self.success(job.key)
                elif job.state in [
                    TaskInstanceState.FAILED,
                    TaskInstanceState.RESTARTING,
                    TaskInstanceState.UP_FOR_RETRY,
                ]:
                    if job.key in self.last_reported_state:
                        del self.last_reported_state[job.key]
                    self.fail(job.key)
                else:
                    self.last_reported_state[job.key] = job.state
            if (
                job.state == TaskInstanceState.SUCCESS
                and job.last_update_t < (datetime.now() - timedelta(minutes=job_success_purge)).timestamp()
            ) or (
                job.state
                in (
                    TaskInstanceState.FAILED,
                    TaskInstanceState.REMOVED,
                    TaskInstanceState.RESTARTING,
                    TaskInstanceState.UP_FOR_RETRY,
                )
                and job.last_update_t < (datetime.now() - timedelta(minutes=job_fail_purge)).timestamp()
            ):
                if job.key in self.last_reported_state:
                    del self.last_reported_state[job.key]
                purged_marker = True
                session.delete(job)
                session.execute(
                    delete(EdgeLogsModel).where(
                        EdgeLogsModel.dag_id == job.dag_id,
                        EdgeLogsModel.run_id == job.run_id,
                        EdgeLogsModel.task_id == job.task_id,
                        EdgeLogsModel.map_index == job.map_index,
                        EdgeLogsModel.try_number == job.try_number,
                    )
                )

        return purged_marker

    @provide_session
    def sync(self, session: Session = NEW_SESSION) -> None:
        """Sync will get called periodically by the heartbeat method."""
        with Stats.timer("edge_executor.sync.duration"):
            orphaned = self._update_orphaned_jobs(session)
            purged = self._purge_jobs(session)
            liveness = self._check_worker_liveness(session)
            if purged or liveness or orphaned:
                session.commit()

    def end(self) -> None:
        """End the executor."""
        self.log.info("Shutting down EdgeExecutor")

    def terminate(self):
        """Terminate the executor; it is not doing anything."""

    def try_adopt_task_instances(self, tis: Sequence[TaskInstance]) -> Sequence[TaskInstance]:
        """
        Try to adopt running task instances that have been abandoned by a SchedulerJob dying.

        Anything that is not adopted will be cleared by the scheduler (and then become eligible for
        re-scheduling)

        :return: any TaskInstances that were unable to be adopted
        """
        # We handle all running tasks from the DB in sync, no adoption logic needed.
        return []

    @staticmethod
    def get_cli_commands() -> list[GroupCommand]:
        return [
            GroupCommand(
                name="edge",
                help="Edge Worker components",
                description=(
                    "Start and manage Edge Worker. Works only when using EdgeExecutor. For more information, "
                    "see https://airflow.apache.org/docs/apache-airflow-providers-edge3/stable/edge_executor.html"
                ),
                subcommands=EDGE_COMMANDS,
            ),
        ]


def _get_parser() -> argparse.ArgumentParser:
    """
    Generate documentation; used by Sphinx.

    :meta private:
    """
    return EdgeExecutor._get_parser()
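
The executor only becomes active once Airflow is pointed at it. Below is a minimal, hypothetical deployment sketch assuming the standard `AIRFLOW__<SECTION>__<KEY>` environment-variable convention; the `[edge]` keys match the options declared in `get_provider_info.py` further down, while the URL value is a placeholder and not something shipped by the provider.

```python
# Hypothetical deployment sketch: select the EdgeExecutor and enable the worker-facing API.
# Set these before any Airflow component (scheduler, webserver/API server) starts.
import os

os.environ["AIRFLOW__CORE__EXECUTOR"] = "airflow.providers.edge3.executors.EdgeExecutor"
os.environ["AIRFLOW__EDGE__API_ENABLED"] = "True"  # serve the plugin endpoint for Edge Workers
os.environ["AIRFLOW__EDGE__API_URL"] = "https://airflow.example.org/edge_worker/v1/rpcapi"  # placeholder URL
```

Edge Workers started with the `airflow edge worker` command then poll that endpoint for queued jobs at `job_poll_interval` seconds.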
airflow/providers/edge3/get_provider_info.py
@@ -0,0 +1,99 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# NOTE! THIS FILE IS AUTOMATICALLY GENERATED AND WILL BE OVERWRITTEN!
#
# IF YOU WANT TO MODIFY THIS FILE, YOU SHOULD MODIFY THE TEMPLATE
# `get_provider_info_TEMPLATE.py.jinja2` IN the `dev/breeze/src/airflow_breeze/templates` DIRECTORY


def get_provider_info():
    return {
        "package-name": "apache-airflow-providers-edge3",
        "name": "Edge Executor",
        "description": "Handle edge workers on remote sites via HTTP(s) connection and orchestrates work over distributed sites\n",
        "plugins": [
            {
                "name": "edge_executor",
                "plugin-class": "airflow.providers.edge3.plugins.edge_executor_plugin.EdgeExecutorPlugin",
            }
        ],
        "executors": ["airflow.providers.edge3.executors.EdgeExecutor"],
        "config": {
            "edge": {
                "description": "This section only applies if you are using the EdgeExecutor in\n``[core]`` section above\n",
                "options": {
                    "api_enabled": {
                        "description": "Flag if the plugin endpoint is enabled to serve Edge Workers.\n",
                        "version_added": None,
                        "type": "boolean",
                        "example": "True",
                        "default": "False",
                    },
                    "api_url": {
                        "description": "URL endpoint on which the Airflow code edge API is accessible from edge worker.\n",
                        "version_added": None,
                        "type": "string",
                        "example": "https://airflow.hosting.org/edge_worker/v1/rpcapi",
                        "default": None,
                    },
                    "job_poll_interval": {
                        "description": "Edge Worker currently polls for new jobs via HTTP. This parameter defines the number\nof seconds it should sleep between polls for new jobs.\nJob polling only happens if the Edge Worker seeks for new work. Not if busy.\n",
                        "version_added": None,
                        "type": "integer",
                        "example": "5",
                        "default": "5",
                    },
                    "heartbeat_interval": {
                        "description": "Edge Worker continuously reports status to the central site. This parameter defines\nhow often a status with heartbeat should be sent.\nDuring heartbeat status is reported as well as it is checked if a running task is to be terminated.\n",
                        "version_added": None,
                        "type": "integer",
                        "example": "10",
                        "default": "30",
                    },
                    "worker_concurrency": {
                        "description": "The concurrency defines the default max parallel running task instances and can also be set during\nstart of worker with the ``airflow edge worker`` command parameter. The size of the workers\nand the resources must support the nature of your tasks. The parameter\nworks together with the concurrency_slots parameter of a task.\n",
                        "version_added": None,
                        "type": "integer",
                        "example": None,
                        "default": "8",
                    },
                    "job_success_purge": {
                        "description": "Minutes after which successful jobs for EdgeExecutor are purged from database\n",
                        "version_added": None,
                        "type": "integer",
                        "example": None,
                        "default": "5",
                    },
                    "job_fail_purge": {
                        "description": "Minutes after which failed jobs for EdgeExecutor are purged from database\n",
                        "version_added": None,
                        "type": "integer",
                        "example": None,
                        "default": "60",
                    },
                    "push_log_chunk_size": {
                        "description": "Edge Worker uploads log files in chunks. If the log file part which is uploaded\nexceeds the chunk size it creates a new request. The application gateway can\nlimit the max body size see:\nhttps://nginx.org/en/docs/http/ngx_http_core_module.html#client_max_body_size\nA HTTP 413 issue can point to this value to fix the issue.\nThis value must be defined in Bytes.\n",
                        "version_added": None,
                        "type": "integer",
                        "example": None,
                        "default": "524288",
                    },
                },
            }
        },
    }
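
The options above are ordinary Airflow config entries; the executor reads them through `airflow.configuration.conf`, exactly as `edge_executor.py` does with `conf.getint("edge", "heartbeat_interval")`. A small sketch, using the declared defaults as fallbacks:

```python
# Read the [edge] options declared above; fallback values mirror the declared defaults.
from airflow.configuration import conf

heartbeat_interval = conf.getint("edge", "heartbeat_interval", fallback=30)        # seconds between worker heartbeats
job_success_purge = conf.getint("edge", "job_success_purge", fallback=5)           # minutes before successful jobs are purged
job_fail_purge = conf.getint("edge", "job_fail_purge", fallback=60)                # minutes before failed jobs are purged
push_log_chunk_size = conf.getint("edge", "push_log_chunk_size", fallback=524288)  # bytes per uploaded log chunk
```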
airflow/providers/edge3/models/__init__.py
@@ -0,0 +1,16 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
airflow/providers/edge3/models/edge_job.py
@@ -0,0 +1,94 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime

from sqlalchemy import (
    Column,
    Index,
    Integer,
    String,
    text,
)

from airflow.models.base import Base, StringID
from airflow.models.taskinstancekey import TaskInstanceKey
from airflow.utils import timezone
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.sqlalchemy import UtcDateTime


class EdgeJobModel(Base, LoggingMixin):
    """
    A job which is queued, waiting or running on an Edge Worker.

    Each tuple in the database represents and describes the state of one job.
    """

    __tablename__ = "edge_job"
    dag_id = Column(StringID(), primary_key=True, nullable=False)
    task_id = Column(StringID(), primary_key=True, nullable=False)
    run_id = Column(StringID(), primary_key=True, nullable=False)
    map_index = Column(Integer, primary_key=True, nullable=False, server_default=text("-1"))
    try_number = Column(Integer, primary_key=True, default=0)
    state = Column(String(20))
    queue = Column(String(256))
    concurrency_slots = Column(Integer)
    command = Column(String(1000))
    queued_dttm = Column(UtcDateTime)
    edge_worker = Column(String(64))
    last_update = Column(UtcDateTime)

    def __init__(
        self,
        dag_id: str,
        task_id: str,
        run_id: str,
        map_index: int,
        try_number: int,
        state: str,
        queue: str,
        concurrency_slots: int,
        command: str,
        queued_dttm: datetime | None = None,
        edge_worker: str | None = None,
        last_update: datetime | None = None,
    ):
        self.dag_id = dag_id
        self.task_id = task_id
        self.run_id = run_id
        self.map_index = map_index
        self.try_number = try_number
        self.state = state
        self.queue = queue
        self.concurrency_slots = concurrency_slots
        self.command = command
        self.queued_dttm = queued_dttm or timezone.utcnow()
        self.edge_worker = edge_worker
        self.last_update = last_update
        super().__init__()

    __table_args__ = (Index("rj_order", state, queued_dttm, queue),)

    @property
    def key(self):
        return TaskInstanceKey(self.dag_id, self.task_id, self.run_id, self.try_number, self.map_index)

    @property
    def last_update_t(self) -> float:
        return self.last_update.timestamp()
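
For illustration, a hypothetical construction of a queued job row, mirroring what `EdgeExecutor.execute_async` adds to the session; all identifier values below are made up:

```python
# Illustrative only: sample values, not a real DAG run.
from airflow.providers.edge3.models.edge_job import EdgeJobModel
from airflow.utils.state import TaskInstanceState

job = EdgeJobModel(
    dag_id="example_dag",
    task_id="example_task",
    run_id="manual__2025-01-01T00:00:00+00:00",
    map_index=-1,
    try_number=1,
    state=TaskInstanceState.QUEUED,
    queue="default",
    concurrency_slots=1,
    command='["airflow", "tasks", "run", "..."]',
)
# The composite key is what the executor tracks in self.running / last_reported_state.
print(job.key)
```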
airflow/providers/edge3/models/edge_logs.py
@@ -0,0 +1,73 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime

from sqlalchemy import (
    Column,
    Integer,
    Text,
    text,
)
from sqlalchemy.dialects.mysql import MEDIUMTEXT

from airflow.models.base import Base, StringID
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.sqlalchemy import UtcDateTime


class EdgeLogsModel(Base, LoggingMixin):
    """
    Temporarily collected logs from an Edge Worker while a job runs on a remote site.

    As the Edge Worker in most cases has a local file system and the web UI has no access
    to read files from the remote site, Edge Workers will send incremental chunks of logs
    of running jobs to the central site. As log storage backends in most cloud cases can not
    append logs, the table is used as a buffer to receive them. Upon task completion logs can be
    flushed to the task log handler.

    Log data therefore is collected in chunks and is only temporary.
    """

    __tablename__ = "edge_logs"
    dag_id = Column(StringID(), primary_key=True, nullable=False)
    task_id = Column(StringID(), primary_key=True, nullable=False)
    run_id = Column(StringID(), primary_key=True, nullable=False)
    map_index = Column(Integer, primary_key=True, nullable=False, server_default=text("-1"))
    try_number = Column(Integer, primary_key=True, default=0)
    log_chunk_time = Column(UtcDateTime, primary_key=True, nullable=False)
    log_chunk_data = Column(Text().with_variant(MEDIUMTEXT(), "mysql"), nullable=False)

    def __init__(
        self,
        dag_id: str,
        task_id: str,
        run_id: str,
        map_index: int,
        try_number: int,
        log_chunk_time: datetime,
        log_chunk_data: str,
    ):
        self.dag_id = dag_id
        self.task_id = task_id
        self.run_id = run_id
        self.map_index = map_index
        self.try_number = try_number
        self.log_chunk_time = log_chunk_time
        self.log_chunk_data = log_chunk_data
        super().__init__()
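
To make the buffering idea concrete, here is a sketch of inserting one log chunk. It approximates what the worker-facing logs route presumably does when a chunk arrives (an assumption based on the model above); all sample values are made up.

```python
# Illustrative only: append one incremental log chunk to the buffer table.
from airflow.providers.edge3.models.edge_logs import EdgeLogsModel
from airflow.utils import timezone
from airflow.utils.session import create_session

with create_session() as session:
    session.add(
        EdgeLogsModel(
            dag_id="example_dag",
            task_id="example_task",
            run_id="manual__2025-01-01T00:00:00+00:00",
            map_index=-1,
            try_number=1,
            log_chunk_time=timezone.utcnow(),
            log_chunk_data="[2025-01-01 00:00:01] INFO - task started\n",
        )
    )
# Once the task finishes, buffered chunks can be flushed to the configured task log handler;
# the rows are later purged together with the corresponding edge_job entry.
```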