apache-airflow-providers-edge3 2.0.0rc1__py3-none-any.whl → 3.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/edge3/__init__.py +1 -1
- airflow/providers/edge3/cli/api_client.py +30 -28
- airflow/providers/edge3/cli/dataclasses.py +3 -10
- airflow/providers/edge3/cli/definition.py +261 -0
- airflow/providers/edge3/cli/edge_command.py +8 -206
- airflow/providers/edge3/cli/worker.py +226 -198
- airflow/providers/edge3/example_dags/win_notepad.py +1 -1
- airflow/providers/edge3/executors/edge_executor.py +24 -49
- airflow/providers/edge3/get_provider_info.py +1 -0
- airflow/providers/edge3/models/edge_job.py +1 -2
- airflow/providers/edge3/models/edge_worker.py +61 -16
- airflow/providers/edge3/plugins/edge_executor_plugin.py +1 -1
- airflow/providers/edge3/plugins/www/dist/main.umd.cjs +8 -8
- airflow/providers/edge3/plugins/www/package.json +32 -27
- airflow/providers/edge3/plugins/www/pnpm-lock.yaml +1625 -1716
- airflow/providers/edge3/plugins/www/src/global.d.ts +24 -0
- airflow/providers/edge3/plugins/www/src/layouts/NavTabs.tsx +25 -3
- airflow/providers/edge3/plugins/www/src/main.tsx +6 -1
- airflow/providers/edge3/plugins/www/src/theme.ts +1 -1
- airflow/providers/edge3/worker_api/datamodels.py +12 -1
- airflow/providers/edge3/worker_api/routes/jobs.py +21 -8
- airflow/providers/edge3/worker_api/routes/logs.py +1 -1
- airflow/providers/edge3/worker_api/routes/worker.py +16 -3
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/METADATA +14 -10
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/RECORD +29 -29
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/licenses/NOTICE +1 -1
- airflow/providers/edge3/plugins/templates/edge_worker_hosts.html +0 -175
- airflow/providers/edge3/plugins/templates/edge_worker_jobs.html +0 -69
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/entry_points.txt +0 -0
- {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/licenses/LICENSE +0 -0
|
@@ -20,16 +20,18 @@ import logging
|
|
|
20
20
|
import os
|
|
21
21
|
import signal
|
|
22
22
|
import sys
|
|
23
|
+
import traceback
|
|
24
|
+
from asyncio import Task, create_task, get_running_loop, sleep
|
|
23
25
|
from datetime import datetime
|
|
24
26
|
from functools import cache
|
|
25
27
|
from http import HTTPStatus
|
|
26
|
-
from multiprocessing import Process
|
|
28
|
+
from multiprocessing import Process, Queue
|
|
27
29
|
from pathlib import Path
|
|
28
|
-
from time import sleep
|
|
29
30
|
from typing import TYPE_CHECKING
|
|
30
31
|
|
|
32
|
+
from aiofiles import open as aio_open
|
|
33
|
+
from aiohttp import ClientResponseError
|
|
31
34
|
from lockfile.pidlockfile import remove_existing_pidfile
|
|
32
|
-
from requests import HTTPError
|
|
33
35
|
|
|
34
36
|
from airflow import __version__ as airflow_version
|
|
35
37
|
from airflow.configuration import conf
|
|
@@ -58,9 +60,12 @@ from airflow.utils.net import getfqdn
|
|
|
58
60
|
from airflow.utils.state import TaskInstanceState
|
|
59
61
|
|
|
60
62
|
if TYPE_CHECKING:
|
|
61
|
-
from airflow.
|
|
63
|
+
from airflow.executors.workloads import ExecuteTask
|
|
62
64
|
|
|
63
65
|
logger = logging.getLogger(__name__)
|
|
66
|
+
base_log_folder = conf.get("logging", "base_log_folder", fallback="NOT AVAILABLE")
|
|
67
|
+
push_logs = conf.getboolean("edge", "push_logs")
|
|
68
|
+
push_log_chunk_size = conf.getint("edge", "push_log_chunk_size")
|
|
64
69
|
|
|
65
70
|
if sys.platform == "darwin":
|
|
66
71
|
setproctitle = lambda title: logger.debug("Mac OS detected, skipping setproctitle")
|
|
@@ -73,21 +78,30 @@ def _edge_hostname() -> str:
|
|
|
73
78
|
return os.environ.get("HOSTNAME", getfqdn())
|
|
74
79
|
|
|
75
80
|
|
|
81
|
+
@cache
|
|
82
|
+
def _execution_api_server_url() -> str:
|
|
83
|
+
"""Get the execution api server url from config or environment."""
|
|
84
|
+
api_url = conf.get("edge", "api_url")
|
|
85
|
+
execution_api_server_url = conf.get("core", "execution_api_server_url", fallback="")
|
|
86
|
+
if not execution_api_server_url and api_url:
|
|
87
|
+
# Derive execution api url from edge api url as fallback
|
|
88
|
+
execution_api_server_url = api_url.replace("edge_worker/v1/rpcapi", "execution")
|
|
89
|
+
logger.info("Using execution api server url: %s", execution_api_server_url)
|
|
90
|
+
return execution_api_server_url
|
|
91
|
+
|
|
92
|
+
|
|
76
93
|
class EdgeWorker:
|
|
77
94
|
"""Runner instance which executes the Edge Worker."""
|
|
78
95
|
|
|
79
96
|
jobs: list[Job] = []
|
|
80
97
|
"""List of jobs that the worker is running currently."""
|
|
81
|
-
last_hb: datetime | None = None
|
|
82
|
-
"""Timestamp of last heart beat sent to server."""
|
|
83
98
|
drain: bool = False
|
|
84
99
|
"""Flag if job processing should be completed and no new jobs fetched for a graceful stop/shutdown."""
|
|
85
100
|
maintenance_mode: bool = False
|
|
86
101
|
"""Flag if job processing should be completed and no new jobs fetched for maintenance mode. """
|
|
87
102
|
maintenance_comments: str | None = None
|
|
88
103
|
"""Comments for maintenance mode."""
|
|
89
|
-
|
|
90
|
-
"""Singleton instance of the worker."""
|
|
104
|
+
background_tasks: set[Task] = set()
|
|
91
105
|
|
|
92
106
|
def __init__(
|
|
93
107
|
self,
|
|
@@ -105,48 +119,54 @@ class EdgeWorker:
|
|
|
105
119
|
self.hostname = hostname
|
|
106
120
|
self.queues = queues
|
|
107
121
|
self.concurrency = concurrency
|
|
108
|
-
self.free_concurrency = concurrency
|
|
109
122
|
self.daemon = daemon
|
|
110
123
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
status_path.write_text(
|
|
132
|
-
WorkerStatus(
|
|
133
|
-
job_count=len(EdgeWorker.jobs),
|
|
134
|
-
jobs=[job.edge_job.key for job in EdgeWorker.jobs],
|
|
135
|
-
state=EdgeWorker._get_state(),
|
|
136
|
-
maintenance=EdgeWorker.maintenance_mode,
|
|
137
|
-
maintenance_comments=EdgeWorker.maintenance_comments,
|
|
138
|
-
drain=EdgeWorker.drain,
|
|
139
|
-
).json
|
|
140
|
-
)
|
|
124
|
+
@property
|
|
125
|
+
def free_concurrency(self) -> int:
|
|
126
|
+
"""Calculate the free concurrency of the worker."""
|
|
127
|
+
used_concurrency = sum(job.edge_job.concurrency_slots for job in self.jobs)
|
|
128
|
+
return self.concurrency - used_concurrency
|
|
129
|
+
|
|
130
|
+
def signal_status(self):
|
|
131
|
+
marker_path = Path(maintenance_marker_file_path(None))
|
|
132
|
+
if marker_path.exists():
|
|
133
|
+
request = MaintenanceMarker.from_json(marker_path.read_text())
|
|
134
|
+
logger.info("Requested to set maintenance mode to %s", request.maintenance)
|
|
135
|
+
self.maintenance_mode = request.maintenance == "on"
|
|
136
|
+
if self.maintenance_mode and request.comments:
|
|
137
|
+
logger.info("Comments: %s", request.comments)
|
|
138
|
+
self.maintenance_comments = request.comments
|
|
139
|
+
marker_path.unlink()
|
|
140
|
+
# send heartbeat immediately to update state
|
|
141
|
+
task = get_running_loop().create_task(self.heartbeat(self.maintenance_comments))
|
|
142
|
+
self.background_tasks.add(task)
|
|
143
|
+
task.add_done_callback(self.background_tasks.discard)
|
|
141
144
|
else:
|
|
142
|
-
logger.info("Request to
|
|
143
|
-
|
|
145
|
+
logger.info("Request to get status of Edge Worker received.")
|
|
146
|
+
status_path = Path(status_file_path(None))
|
|
147
|
+
status_path.write_text(
|
|
148
|
+
WorkerStatus(
|
|
149
|
+
job_count=len(self.jobs),
|
|
150
|
+
jobs=[job.edge_job.key for job in self.jobs],
|
|
151
|
+
state=self._get_state(),
|
|
152
|
+
maintenance=self.maintenance_mode,
|
|
153
|
+
maintenance_comments=self.maintenance_comments,
|
|
154
|
+
drain=self.drain,
|
|
155
|
+
).json
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
def signal_drain(self):
|
|
159
|
+
self.drain = True
|
|
160
|
+
logger.info("Request to shut down Edge Worker received, waiting for jobs to complete.")
|
|
144
161
|
|
|
145
|
-
def shutdown_handler(self
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
162
|
+
def shutdown_handler(self):
|
|
163
|
+
self.drain = True
|
|
164
|
+
msg = "SIGTERM received. Sending SIGTERM to all jobs and quit"
|
|
165
|
+
logger.info(msg)
|
|
166
|
+
for job in self.jobs:
|
|
167
|
+
if job.process.pid:
|
|
168
|
+
os.setpgid(job.process.pid, 0)
|
|
169
|
+
os.kill(job.process.pid, signal.SIGTERM)
|
|
150
170
|
|
|
151
171
|
def _get_sysinfo(self) -> dict:
|
|
152
172
|
"""Produce the sysinfo from worker to post to central site."""
|
|
@@ -157,116 +177,121 @@ class EdgeWorker:
|
|
|
157
177
|
"free_concurrency": self.free_concurrency,
|
|
158
178
|
}
|
|
159
179
|
|
|
160
|
-
|
|
161
|
-
def _get_state() -> EdgeWorkerState:
|
|
180
|
+
def _get_state(self) -> EdgeWorkerState:
|
|
162
181
|
"""State of the Edge Worker."""
|
|
163
|
-
if
|
|
164
|
-
if
|
|
182
|
+
if self.jobs:
|
|
183
|
+
if self.drain:
|
|
165
184
|
return EdgeWorkerState.TERMINATING
|
|
166
|
-
if
|
|
185
|
+
if self.maintenance_mode:
|
|
167
186
|
return EdgeWorkerState.MAINTENANCE_PENDING
|
|
168
187
|
return EdgeWorkerState.RUNNING
|
|
169
188
|
|
|
170
|
-
if
|
|
171
|
-
if
|
|
189
|
+
if self.drain:
|
|
190
|
+
if self.maintenance_mode:
|
|
172
191
|
return EdgeWorkerState.OFFLINE_MAINTENANCE
|
|
173
192
|
return EdgeWorkerState.OFFLINE
|
|
174
193
|
|
|
175
|
-
if
|
|
194
|
+
if self.maintenance_mode:
|
|
176
195
|
return EdgeWorkerState.MAINTENANCE_MODE
|
|
177
196
|
return EdgeWorkerState.IDLE
|
|
178
197
|
|
|
179
|
-
|
|
180
|
-
@cache
|
|
181
|
-
def _execution_api_server_url() -> str:
|
|
182
|
-
"""Get the execution api server url from config or environment."""
|
|
183
|
-
api_url = conf.get("edge", "api_url")
|
|
184
|
-
execution_api_server_url = conf.get("core", "execution_api_server_url", fallback="")
|
|
185
|
-
if not execution_api_server_url and api_url:
|
|
186
|
-
# Derive execution api url from edge api url as fallback
|
|
187
|
-
execution_api_server_url = api_url.replace("edge_worker/v1/rpcapi", "execution")
|
|
188
|
-
logger.info("Using execution api server url: %s", execution_api_server_url)
|
|
189
|
-
return execution_api_server_url
|
|
190
|
-
|
|
191
|
-
@staticmethod
|
|
192
|
-
def _run_job_via_supervisor(workload, execution_api_server_url) -> int:
|
|
198
|
+
def _run_job_via_supervisor(self, workload: ExecuteTask, results_queue: Queue) -> int:
|
|
193
199
|
from airflow.sdk.execution_time.supervisor import supervise
|
|
194
200
|
|
|
195
201
|
# Ignore ctrl-c in this process -- we don't want to kill _this_ one. we let tasks run to completion
|
|
196
|
-
|
|
202
|
+
os.setpgrp()
|
|
197
203
|
|
|
198
204
|
logger.info("Worker starting up pid=%d", os.getpid())
|
|
199
|
-
|
|
205
|
+
ti = workload.ti
|
|
206
|
+
setproctitle(
|
|
207
|
+
"airflow edge supervisor: "
|
|
208
|
+
f"dag_id={ti.dag_id} task_id={ti.task_id} run_id={ti.run_id} map_index={ti.map_index} "
|
|
209
|
+
f"try_number={ti.try_number}"
|
|
210
|
+
)
|
|
200
211
|
|
|
201
212
|
try:
|
|
202
213
|
supervise(
|
|
203
214
|
# This is the "wrong" ti type, but it duck types the same. TODO: Create a protocol for this.
|
|
204
215
|
# Same like in airflow/executors/local_executor.py:_execute_work()
|
|
205
|
-
ti=
|
|
216
|
+
ti=ti, # type: ignore[arg-type]
|
|
206
217
|
dag_rel_path=workload.dag_rel_path,
|
|
207
218
|
bundle_info=workload.bundle_info,
|
|
208
219
|
token=workload.token,
|
|
209
|
-
server=
|
|
220
|
+
server=_execution_api_server_url(),
|
|
210
221
|
log_path=workload.log_path,
|
|
211
222
|
)
|
|
212
223
|
return 0
|
|
213
224
|
except Exception as e:
|
|
214
|
-
logger.exception("Task execution failed
|
|
225
|
+
logger.exception("Task execution failed")
|
|
226
|
+
results_queue.put(e)
|
|
215
227
|
return 1
|
|
216
228
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
workload: ExecuteTask = edge_job.command
|
|
229
|
+
def _launch_job(self, workload: ExecuteTask) -> tuple[Process, Queue[Exception]]:
|
|
230
|
+
# Improvement: Use frozen GC to prevent child process from copying unnecessary memory
|
|
231
|
+
# See _spawn_workers_with_gc_freeze() in airflow-core/src/airflow/executors/local_executor.py
|
|
232
|
+
results_queue: Queue[Exception] = Queue()
|
|
223
233
|
process = Process(
|
|
224
|
-
target=
|
|
225
|
-
kwargs={"workload": workload, "
|
|
234
|
+
target=self._run_job_via_supervisor,
|
|
235
|
+
kwargs={"workload": workload, "results_queue": results_queue},
|
|
226
236
|
)
|
|
227
237
|
process.start()
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
logfile
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
238
|
+
return process, results_queue
|
|
239
|
+
|
|
240
|
+
async def _push_logs_in_chunks(self, job: Job):
|
|
241
|
+
if push_logs and job.logfile.exists() and job.logfile.stat().st_size > job.logsize:
|
|
242
|
+
async with aio_open(job.logfile, mode="rb") as logf:
|
|
243
|
+
await logf.seek(job.logsize, os.SEEK_SET)
|
|
244
|
+
read_data = await logf.read()
|
|
245
|
+
job.logsize += len(read_data)
|
|
246
|
+
# backslashreplace to keep not decoded characters and not raising exception
|
|
247
|
+
# replace null with question mark to fix issue during DB push
|
|
248
|
+
log_data = read_data.decode(errors="backslashreplace").replace("\x00", "\ufffd")
|
|
249
|
+
while True:
|
|
250
|
+
chunk_data = log_data[:push_log_chunk_size]
|
|
251
|
+
log_data = log_data[push_log_chunk_size:]
|
|
252
|
+
if not chunk_data:
|
|
253
|
+
break
|
|
254
|
+
|
|
255
|
+
await logs_push(
|
|
256
|
+
task=job.edge_job.key,
|
|
257
|
+
log_chunk_time=timezone.utcnow(),
|
|
258
|
+
log_chunk_data=chunk_data,
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
async def start(self):
|
|
235
262
|
"""Start the execution in a loop until terminated."""
|
|
236
263
|
try:
|
|
237
|
-
self.
|
|
238
|
-
self.hostname, EdgeWorkerState.STARTING, self.queues, self._get_sysinfo()
|
|
239
|
-
).last_update
|
|
264
|
+
await worker_register(self.hostname, EdgeWorkerState.STARTING, self.queues, self._get_sysinfo())
|
|
240
265
|
except EdgeWorkerVersionException as e:
|
|
241
266
|
logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
|
|
242
267
|
raise SystemExit(str(e))
|
|
243
268
|
except EdgeWorkerDuplicateException as e:
|
|
244
269
|
logger.error(str(e))
|
|
245
270
|
raise SystemExit(str(e))
|
|
246
|
-
except
|
|
247
|
-
if
|
|
248
|
-
|
|
271
|
+
except ClientResponseError as e:
|
|
272
|
+
# Note: Method not allowed is raised by FastAPI if the API is not enabled (not 404)
|
|
273
|
+
if e.status in {HTTPStatus.NOT_FOUND, HTTPStatus.METHOD_NOT_ALLOWED}:
|
|
274
|
+
raise SystemExit(
|
|
275
|
+
"Error: API endpoint is not ready, please set [edge] api_enabled=True. Or check if the URL is correct to your deployment."
|
|
276
|
+
)
|
|
249
277
|
raise SystemExit(str(e))
|
|
250
278
|
if not self.daemon:
|
|
251
279
|
write_pid_to_pidfile(self.pid_file_path)
|
|
252
|
-
|
|
253
|
-
signal.
|
|
254
|
-
|
|
280
|
+
loop = get_running_loop()
|
|
281
|
+
loop.add_signal_handler(signal.SIGINT, self.signal_drain)
|
|
282
|
+
loop.add_signal_handler(SIG_STATUS, self.signal_status)
|
|
283
|
+
loop.add_signal_handler(signal.SIGTERM, self.shutdown_handler)
|
|
284
|
+
setproctitle(f"airflow edge worker: {self.hostname}")
|
|
255
285
|
os.environ["HOSTNAME"] = self.hostname
|
|
256
286
|
os.environ["AIRFLOW__CORE__HOSTNAME_CALLABLE"] = f"{_edge_hostname.__module__}._edge_hostname"
|
|
257
287
|
try:
|
|
258
|
-
|
|
259
|
-
self.last_hb = datetime.now()
|
|
260
|
-
while not EdgeWorker.drain or EdgeWorker.jobs:
|
|
261
|
-
self.loop()
|
|
288
|
+
await self.loop()
|
|
262
289
|
|
|
263
290
|
logger.info("Quitting worker, signal being offline.")
|
|
264
291
|
try:
|
|
265
|
-
worker_set_state(
|
|
292
|
+
await worker_set_state(
|
|
266
293
|
self.hostname,
|
|
267
|
-
EdgeWorkerState.OFFLINE_MAINTENANCE
|
|
268
|
-
if EdgeWorker.maintenance_mode
|
|
269
|
-
else EdgeWorkerState.OFFLINE,
|
|
294
|
+
EdgeWorkerState.OFFLINE_MAINTENANCE if self.maintenance_mode else EdgeWorkerState.OFFLINE,
|
|
270
295
|
0,
|
|
271
296
|
self.queues,
|
|
272
297
|
self._get_sysinfo(),
|
|
@@ -277,95 +302,98 @@ class EdgeWorker:
|
|
|
277
302
|
if not self.daemon:
|
|
278
303
|
remove_existing_pidfile(self.pid_file_path)
|
|
279
304
|
|
|
280
|
-
def loop(self):
|
|
305
|
+
async def loop(self):
|
|
281
306
|
"""Run a loop of scheduling and monitoring tasks."""
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
self.
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
307
|
+
last_hb = datetime.now()
|
|
308
|
+
worker_state_changed = True # force heartbeat at start
|
|
309
|
+
previous_jobs = 0
|
|
310
|
+
while not self.drain or self.jobs:
|
|
311
|
+
if (
|
|
312
|
+
self.drain
|
|
313
|
+
or datetime.now().timestamp() - last_hb.timestamp() > self.hb_interval
|
|
314
|
+
or worker_state_changed # send heartbeat immediately if the state is different in db
|
|
315
|
+
or previous_jobs != len(self.jobs) # when number of jobs changes
|
|
316
|
+
):
|
|
317
|
+
worker_state_changed = await self.heartbeat()
|
|
318
|
+
last_hb = datetime.now()
|
|
319
|
+
previous_jobs = len(self.jobs)
|
|
320
|
+
|
|
321
|
+
if self.maintenance_mode:
|
|
322
|
+
logger.info("in maintenance mode%s", f", {len(self.jobs)} draining jobs" if self.jobs else "")
|
|
323
|
+
elif not self.drain and self.free_concurrency > 0:
|
|
324
|
+
task = create_task(self.fetch_and_run_job())
|
|
325
|
+
self.background_tasks.add(task)
|
|
326
|
+
task.add_done_callback(self.background_tasks.discard)
|
|
327
|
+
else:
|
|
328
|
+
logger.info("%i %s running", len(self.jobs), "job is" if len(self.jobs) == 1 else "jobs are")
|
|
329
|
+
|
|
330
|
+
await self.interruptible_sleep()
|
|
331
|
+
|
|
332
|
+
async def fetch_and_run_job(self) -> None:
|
|
333
|
+
"""Fetch, start and monitor a new job."""
|
|
302
334
|
logger.debug("Attempting to fetch a new job...")
|
|
303
|
-
edge_job = jobs_fetch(self.hostname, self.queues, self.free_concurrency)
|
|
304
|
-
if edge_job:
|
|
305
|
-
logger.info(
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
335
|
+
edge_job = await jobs_fetch(self.hostname, self.queues, self.free_concurrency)
|
|
336
|
+
if not edge_job:
|
|
337
|
+
logger.info(
|
|
338
|
+
"No new job to process%s",
|
|
339
|
+
f", {len(self.jobs)} still running" if self.jobs else "",
|
|
340
|
+
)
|
|
341
|
+
return
|
|
342
|
+
|
|
343
|
+
logger.info("Received job: %s", edge_job.identifier)
|
|
344
|
+
|
|
345
|
+
workload: ExecuteTask = edge_job.command
|
|
346
|
+
process, results_queue = self._launch_job(workload)
|
|
347
|
+
if TYPE_CHECKING:
|
|
348
|
+
assert workload.log_path # We need to assume this is defined in here
|
|
349
|
+
logfile = Path(base_log_folder, workload.log_path)
|
|
350
|
+
job = Job(edge_job, process, logfile)
|
|
351
|
+
self.jobs.append(job)
|
|
352
|
+
await jobs_set_state(edge_job.key, TaskInstanceState.RUNNING)
|
|
353
|
+
|
|
354
|
+
# As we got one job, directly fetch another one if possible
|
|
355
|
+
if self.free_concurrency > 0:
|
|
356
|
+
task = create_task(self.fetch_and_run_job())
|
|
357
|
+
self.background_tasks.add(task)
|
|
358
|
+
task.add_done_callback(self.background_tasks.discard)
|
|
359
|
+
|
|
360
|
+
while job.is_running:
|
|
361
|
+
await self._push_logs_in_chunks(job)
|
|
362
|
+
for _ in range(0, self.job_poll_interval * 10):
|
|
363
|
+
await sleep(0.1)
|
|
364
|
+
if not job.is_running:
|
|
365
|
+
break
|
|
366
|
+
await self._push_logs_in_chunks(job)
|
|
367
|
+
|
|
368
|
+
self.jobs.remove(job)
|
|
369
|
+
if job.is_success:
|
|
370
|
+
logger.info("Job completed: %s", job.edge_job.identifier)
|
|
371
|
+
await jobs_set_state(job.edge_job.key, TaskInstanceState.SUCCESS)
|
|
372
|
+
else:
|
|
373
|
+
if results_queue.empty():
|
|
374
|
+
ex_txt = "(Unknown error, no exception details available)"
|
|
329
375
|
else:
|
|
330
|
-
|
|
376
|
+
ex = results_queue.get()
|
|
377
|
+
ex_txt = "\n".join(traceback.format_exception(ex))
|
|
378
|
+
logger.error("Job failed: %s with:\n%s", job.edge_job.identifier, ex_txt)
|
|
379
|
+
# Push it upwards to logs for better diagnostic as well
|
|
380
|
+
await logs_push(
|
|
381
|
+
task=job.edge_job.key,
|
|
382
|
+
log_chunk_time=timezone.utcnow(),
|
|
383
|
+
log_chunk_data=f"Error starting job:\n{ex_txt}",
|
|
384
|
+
)
|
|
385
|
+
await jobs_set_state(job.edge_job.key, TaskInstanceState.FAILED)
|
|
331
386
|
|
|
332
|
-
|
|
333
|
-
conf.getboolean("edge", "push_logs")
|
|
334
|
-
and job.logfile.exists()
|
|
335
|
-
and job.logfile.stat().st_size > job.logsize
|
|
336
|
-
):
|
|
337
|
-
with job.logfile.open("rb") as logfile:
|
|
338
|
-
push_log_chunk_size = conf.getint("edge", "push_log_chunk_size")
|
|
339
|
-
logfile.seek(job.logsize, os.SEEK_SET)
|
|
340
|
-
read_data = logfile.read()
|
|
341
|
-
job.logsize += len(read_data)
|
|
342
|
-
# backslashreplace to keep not decoded characters and not raising exception
|
|
343
|
-
# replace null with question mark to fix issue during DB push
|
|
344
|
-
log_data = read_data.decode(errors="backslashreplace").replace("\x00", "\ufffd")
|
|
345
|
-
while True:
|
|
346
|
-
chunk_data = log_data[:push_log_chunk_size]
|
|
347
|
-
log_data = log_data[push_log_chunk_size:]
|
|
348
|
-
if not chunk_data:
|
|
349
|
-
break
|
|
350
|
-
|
|
351
|
-
logs_push(
|
|
352
|
-
task=job.edge_job.key,
|
|
353
|
-
log_chunk_time=timezone.utcnow(),
|
|
354
|
-
log_chunk_data=chunk_data,
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
self.free_concurrency = self.concurrency - used_concurrency
|
|
358
|
-
|
|
359
|
-
def heartbeat(self, new_maintenance_comments: str | None = None) -> bool:
|
|
387
|
+
async def heartbeat(self, new_maintenance_comments: str | None = None) -> bool:
|
|
360
388
|
"""Report liveness state of worker to central site with stats."""
|
|
361
|
-
state =
|
|
389
|
+
state = self._get_state()
|
|
362
390
|
sysinfo = self._get_sysinfo()
|
|
363
391
|
worker_state_changed: bool = False
|
|
364
392
|
try:
|
|
365
|
-
worker_info = worker_set_state(
|
|
393
|
+
worker_info = await worker_set_state(
|
|
366
394
|
self.hostname,
|
|
367
395
|
state,
|
|
368
|
-
len(
|
|
396
|
+
len(self.jobs),
|
|
369
397
|
self.queues,
|
|
370
398
|
sysinfo,
|
|
371
399
|
new_maintenance_comments,
|
|
@@ -373,31 +401,31 @@ class EdgeWorker:
|
|
|
373
401
|
self.queues = worker_info.queues
|
|
374
402
|
if worker_info.state == EdgeWorkerState.MAINTENANCE_REQUEST:
|
|
375
403
|
logger.info("Maintenance mode requested!")
|
|
376
|
-
|
|
404
|
+
self.maintenance_mode = True
|
|
377
405
|
elif (
|
|
378
|
-
worker_info.state in [EdgeWorkerState.IDLE, EdgeWorkerState.RUNNING]
|
|
379
|
-
and EdgeWorker.maintenance_mode
|
|
406
|
+
worker_info.state in [EdgeWorkerState.IDLE, EdgeWorkerState.RUNNING] and self.maintenance_mode
|
|
380
407
|
):
|
|
381
408
|
logger.info("Exit Maintenance mode requested!")
|
|
382
|
-
|
|
383
|
-
if
|
|
384
|
-
|
|
409
|
+
self.maintenance_mode = False
|
|
410
|
+
if self.maintenance_mode:
|
|
411
|
+
self.maintenance_comments = worker_info.maintenance_comments
|
|
385
412
|
else:
|
|
386
|
-
|
|
413
|
+
self.maintenance_comments = None
|
|
387
414
|
if worker_info.state == EdgeWorkerState.SHUTDOWN_REQUEST:
|
|
388
415
|
logger.info("Shutdown requested!")
|
|
389
|
-
|
|
416
|
+
self.drain = True
|
|
390
417
|
|
|
391
418
|
worker_state_changed = worker_info.state != state
|
|
392
419
|
except EdgeWorkerVersionException:
|
|
393
420
|
logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
|
|
394
|
-
|
|
421
|
+
self.drain = True
|
|
395
422
|
return worker_state_changed
|
|
396
423
|
|
|
397
|
-
def interruptible_sleep(self):
|
|
398
|
-
"""Sleeps but stops sleeping if drain is made."""
|
|
399
|
-
drain_before_sleep =
|
|
424
|
+
async def interruptible_sleep(self):
|
|
425
|
+
"""Sleeps but stops sleeping if drain is made or some job completed."""
|
|
426
|
+
drain_before_sleep = self.drain
|
|
427
|
+
jobcount_before_sleep = len(self.jobs)
|
|
400
428
|
for _ in range(0, self.job_poll_interval * 10):
|
|
401
|
-
sleep(0.1)
|
|
402
|
-
if drain_before_sleep !=
|
|
429
|
+
await sleep(0.1)
|
|
430
|
+
if drain_before_sleep != self.drain or len(self.jobs) < jobcount_before_sleep:
|
|
403
431
|
return
|