apache-airflow-providers-edge3 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. airflow/providers/edge3/LICENSE +201 -0
  2. airflow/providers/edge3/__init__.py +39 -0
  3. airflow/providers/edge3/cli/__init__.py +16 -0
  4. airflow/providers/edge3/cli/api_client.py +206 -0
  5. airflow/providers/edge3/cli/dataclasses.py +95 -0
  6. airflow/providers/edge3/cli/edge_command.py +689 -0
  7. airflow/providers/edge3/example_dags/__init__.py +16 -0
  8. airflow/providers/edge3/example_dags/integration_test.py +164 -0
  9. airflow/providers/edge3/example_dags/win_notepad.py +83 -0
  10. airflow/providers/edge3/example_dags/win_test.py +342 -0
  11. airflow/providers/edge3/executors/__init__.py +22 -0
  12. airflow/providers/edge3/executors/edge_executor.py +367 -0
  13. airflow/providers/edge3/get_provider_info.py +99 -0
  14. airflow/providers/edge3/models/__init__.py +16 -0
  15. airflow/providers/edge3/models/edge_job.py +94 -0
  16. airflow/providers/edge3/models/edge_logs.py +73 -0
  17. airflow/providers/edge3/models/edge_worker.py +230 -0
  18. airflow/providers/edge3/openapi/__init__.py +19 -0
  19. airflow/providers/edge3/openapi/edge_worker_api_v1.yaml +808 -0
  20. airflow/providers/edge3/plugins/__init__.py +16 -0
  21. airflow/providers/edge3/plugins/edge_executor_plugin.py +229 -0
  22. airflow/providers/edge3/plugins/templates/edge_worker_hosts.html +175 -0
  23. airflow/providers/edge3/plugins/templates/edge_worker_jobs.html +69 -0
  24. airflow/providers/edge3/version_compat.py +36 -0
  25. airflow/providers/edge3/worker_api/__init__.py +17 -0
  26. airflow/providers/edge3/worker_api/app.py +43 -0
  27. airflow/providers/edge3/worker_api/auth.py +135 -0
  28. airflow/providers/edge3/worker_api/datamodels.py +190 -0
  29. airflow/providers/edge3/worker_api/routes/__init__.py +16 -0
  30. airflow/providers/edge3/worker_api/routes/_v2_compat.py +135 -0
  31. airflow/providers/edge3/worker_api/routes/_v2_routes.py +237 -0
  32. airflow/providers/edge3/worker_api/routes/health.py +28 -0
  33. airflow/providers/edge3/worker_api/routes/jobs.py +162 -0
  34. airflow/providers/edge3/worker_api/routes/logs.py +133 -0
  35. airflow/providers/edge3/worker_api/routes/worker.py +224 -0
  36. apache_airflow_providers_edge3-1.0.0.dist-info/METADATA +117 -0
  37. apache_airflow_providers_edge3-1.0.0.dist-info/RECORD +39 -0
  38. apache_airflow_providers_edge3-1.0.0.dist-info/WHEEL +4 -0
  39. apache_airflow_providers_edge3-1.0.0.dist-info/entry_points.txt +6 -0
airflow/providers/edge3/cli/edge_command.py
@@ -0,0 +1,689 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements.  See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership.  The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License.  You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied.  See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ import signal
+ import sys
+ from dataclasses import asdict
+ from datetime import datetime
+ from http import HTTPStatus
+ from multiprocessing import Process
+ from pathlib import Path
+ from subprocess import Popen
+ from time import sleep, time
+ from typing import TYPE_CHECKING
+
+ import psutil
+ from lockfile.pidlockfile import read_pid_from_pidfile, remove_existing_pidfile, write_pid_to_pidfile
+ from requests import HTTPError
+
+ from airflow import __version__ as airflow_version, settings
+ from airflow.cli.cli_config import ARG_PID, ARG_VERBOSE, ActionCommand, Arg
+ from airflow.configuration import conf
+ from airflow.providers.edge3 import __version__ as edge_provider_version
+ from airflow.providers.edge3.cli.api_client import (
+     jobs_fetch,
+     jobs_set_state,
+     logs_logfile_path,
+     logs_push,
+     worker_register,
+     worker_set_state,
+ )
+ from airflow.providers.edge3.cli.dataclasses import Job, MaintenanceMarker, WorkerStatus
+ from airflow.providers.edge3.models.edge_worker import EdgeWorkerState, EdgeWorkerVersionException
+ from airflow.providers.edge3.version_compat import AIRFLOW_V_3_0_PLUS
+ from airflow.utils import cli as cli_utils, timezone
+ from airflow.utils.net import getfqdn
+ from airflow.utils.platform import IS_WINDOWS
+ from airflow.utils.providers_configuration_loader import providers_configuration_loaded
+ from airflow.utils.state import TaskInstanceState
+
+ if TYPE_CHECKING:
+     from airflow.providers.edge3.worker_api.datamodels import EdgeJobFetched
+
+ logger = logging.getLogger(__name__)
+ EDGE_WORKER_PROCESS_NAME = "edge-worker"
+ EDGE_WORKER_HEADER = "\n".join(
+     [
+         r" ____   __           _      __         __",
+         r"  / __/__/ /__ ____  | | /| / /__  ____/ /_____ ____",
+         r" / _// _  / _ `/ -_) | |/ |/ / _ \/ __/  '_/ -_) __/",
+         r"/___/\_,_/\_, /\__/  |__/|__/\___/_/ /_/\_\\__/_/",
+         r"         /___/",
+         r"",
+     ]
+ )
+
+
+ @providers_configuration_loaded
+ def force_use_internal_api_on_edge_worker():
+     """
+     Ensure that the environment is configured for the internal API without needing to declare it outside.
+
+     This is only required for an Edge worker and must be done before the Click CLI wrapper is initiated.
+     That is because the CLI wrapper will attempt to establish a DB connection, which will fail before the
+     function call can take effect. In an Edge worker, we need to "patch" the environment before starting.
+     """
+     # export Edge API to be used for internal API
+     os.environ["_AIRFLOW__SKIP_DATABASE_EXECUTOR_COMPATIBILITY_CHECK"] = "1"
+     os.environ["AIRFLOW_ENABLE_AIP_44"] = "True"
+     if "airflow" in sys.argv[0] and sys.argv[1:3] == ["edge", "worker"]:
+         api_url = conf.get("edge", "api_url")
+         if not api_url:
+             raise SystemExit("Error: API URL is not configured, please correct configuration.")
+         logger.info("Starting worker with API endpoint %s", api_url)
+         os.environ["AIRFLOW__CORE__INTERNAL_API_URL"] = api_url
+
+
+ force_use_internal_api_on_edge_worker()
+
+
+ def _status_signal() -> signal.Signals:
+     if IS_WINDOWS:
+         return signal.SIGBREAK  # type: ignore[attr-defined]
+     return signal.SIGUSR2
+
+
+ SIG_STATUS = _status_signal()
+
+
+ def _pid_file_path(pid_file: str | None) -> str:
+     return cli_utils.setup_locations(process=EDGE_WORKER_PROCESS_NAME, pid=pid_file)[0]
+
+
+ def _get_pid(pid_file: str | None) -> int:
+     pid = read_pid_from_pidfile(_pid_file_path(pid_file))
+     if not pid:
+         logger.warning("Could not find PID of worker.")
+         sys.exit(1)
+     return pid
+
+
+ def _status_file_path(pid_file: str | None) -> str:
+     return cli_utils.setup_locations(process=EDGE_WORKER_PROCESS_NAME, pid=pid_file)[1]
+
+
+ def _maintenance_marker_file_path(pid_file: str | None) -> str:
+     return cli_utils.setup_locations(process=EDGE_WORKER_PROCESS_NAME, pid=pid_file)[1][:-4] + ".in"
+
+
+ def _write_pid_to_pidfile(pid_file_path: str):
+     """Write PIDs for Edge Workers to disk, handling existing PID files."""
+     if Path(pid_file_path).exists():
+         # Handle existing PID files on disk
+         logger.info("An existing PID file has been found: %s.", pid_file_path)
+         pid_stored_in_pid_file = read_pid_from_pidfile(pid_file_path)
+         if os.getpid() == pid_stored_in_pid_file:
+             raise SystemExit("A PID file has already been written")
+         # PID file was written by dead or already running instance
+         if psutil.pid_exists(pid_stored_in_pid_file):
+             # case 1: another instance uses the same path for its PID file
+             raise SystemExit(
+                 f"The PID file {pid_file_path} contains the PID of another running process. "
+                 "Configuration issue: edge worker instance must use different PID file paths!"
+             )
+         # case 2: previous instance crashed without cleaning up its PID file
+         logger.warning("PID file is orphaned. Cleaning up.")
+         remove_existing_pidfile(pid_file_path)
+     logger.debug("PID file written to %s.", pid_file_path)
+     write_pid_to_pidfile(pid_file_path)
+
+
+ def _edge_hostname() -> str:
+     """Get the hostname of the edge worker that should be reported by tasks."""
+     return os.environ.get("HOSTNAME", getfqdn())
+
+
+ class _EdgeWorkerCli:
+     """Runner instance which executes the Edge Worker."""
+
+     jobs: list[Job] = []
+     """List of jobs that the worker is running currently."""
+     last_hb: datetime | None = None
+     """Timestamp of last heart beat sent to server."""
+     drain: bool = False
+     """Flag if job processing should be completed and no new jobs fetched for a graceful stop/shutdown."""
+     maintenance_mode: bool = False
+     """Flag if job processing should be completed and no new jobs fetched for maintenance mode."""
+     maintenance_comments: str | None = None
+     """Comments for maintenance mode."""
+
+     edge_instance: _EdgeWorkerCli | None = None
+     """Singleton instance of the worker."""
+
+     def __init__(
+         self,
+         pid_file_path: str,
+         hostname: str,
+         queues: list[str] | None,
+         concurrency: int,
+         job_poll_interval: int,
+         heartbeat_interval: int,
+     ):
+         self.pid_file_path = pid_file_path
+         self.job_poll_interval = job_poll_interval
+         self.hb_interval = heartbeat_interval
+         self.hostname = hostname
+         self.queues = queues
+         self.concurrency = concurrency
+         self.free_concurrency = concurrency
+
+         _EdgeWorkerCli.edge_instance = self
+
+     @staticmethod
+     def signal_handler(sig: signal.Signals, frame):
+         if sig == SIG_STATUS:
+             marker_path = Path(_maintenance_marker_file_path(None))
+             if marker_path.exists():
+                 request = MaintenanceMarker.from_json(marker_path.read_text())
+                 logger.info("Requested to set maintenance mode to %s", request.maintenance)
+                 _EdgeWorkerCli.maintenance_mode = request.maintenance == "on"
+                 if _EdgeWorkerCli.maintenance_mode and request.comments:
+                     logger.info("Comments: %s", request.comments)
+                     _EdgeWorkerCli.maintenance_comments = request.comments
+                 marker_path.unlink()
+                 # send heartbeat immediately to update state
+                 if _EdgeWorkerCli.edge_instance:
+                     _EdgeWorkerCli.edge_instance.heartbeat(_EdgeWorkerCli.maintenance_comments)
+             else:
+                 logger.info("Request to get status of Edge Worker received.")
+                 status_path = Path(_status_file_path(None))
+                 status_path.write_text(
+                     WorkerStatus(
+                         job_count=len(_EdgeWorkerCli.jobs),
+                         jobs=[job.edge_job.key for job in _EdgeWorkerCli.jobs],
+                         state=_EdgeWorkerCli._get_state(),
+                         maintenance=_EdgeWorkerCli.maintenance_mode,
+                         maintenance_comments=_EdgeWorkerCli.maintenance_comments,
+                         drain=_EdgeWorkerCli.drain,
+                     ).json
+                 )
+         else:
+             logger.info("Request to shut down Edge Worker received, waiting for jobs to complete.")
+             _EdgeWorkerCli.drain = True
+
+     def shutdown_handler(self, sig, frame):
+         logger.info("SIGTERM received. Terminating all jobs and quit")
+         for job in _EdgeWorkerCli.jobs:
+             os.killpg(job.process.pid, signal.SIGTERM)
+         _EdgeWorkerCli.drain = True
+
+     def _get_sysinfo(self) -> dict:
+         """Produce the sysinfo from worker to post to central site."""
+         return {
+             "airflow_version": airflow_version,
+             "edge_provider_version": edge_provider_version,
+             "concurrency": self.concurrency,
+             "free_concurrency": self.free_concurrency,
+         }
+
+     @staticmethod
+     def _get_state() -> EdgeWorkerState:
+         """State of the Edge Worker."""
+         if _EdgeWorkerCli.jobs:
+             if _EdgeWorkerCli.drain:
+                 return EdgeWorkerState.TERMINATING
+             if _EdgeWorkerCli.maintenance_mode:
+                 return EdgeWorkerState.MAINTENANCE_PENDING
+             return EdgeWorkerState.RUNNING
+
+         if _EdgeWorkerCli.drain:
+             if _EdgeWorkerCli.maintenance_mode:
+                 return EdgeWorkerState.OFFLINE_MAINTENANCE
+             return EdgeWorkerState.OFFLINE
+
+         if _EdgeWorkerCli.maintenance_mode:
+             return EdgeWorkerState.MAINTENANCE_MODE
+         return EdgeWorkerState.IDLE
+
+     def _launch_job_af3(self, edge_job: EdgeJobFetched) -> tuple[Process, Path]:
+         if TYPE_CHECKING:
+             from airflow.executors.workloads import ExecuteTask
+
+         def _run_job_via_supervisor(
+             workload: ExecuteTask,
+         ) -> int:
+             from setproctitle import setproctitle
+
+             from airflow.sdk.execution_time.supervisor import supervise
+
+             # Ignore ctrl-c in this process -- we don't want to kill _this_ one. we let tasks run to completion
+             signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+             logger.info("Worker starting up pid=%d", os.getpid())
+             setproctitle(f"airflow edge worker: {workload.ti.key}")
+
+             try:
+                 supervise(
+                     # This is the "wrong" ti type, but it duck types the same. TODO: Create a protocol for this.
+                     # Same like in airflow/executors/local_executor.py:_execute_work()
+                     ti=workload.ti,  # type: ignore[arg-type]
+                     dag_rel_path=workload.dag_rel_path,
+                     bundle_info=workload.bundle_info,
+                     token=workload.token,
+                     server=conf.get("core", "execution_api_server_url"),
+                     log_path=workload.log_path,
+                 )
+                 return 0
+             except Exception as e:
+                 logger.exception("Task execution failed: %s", e)
+                 return 1
+
+         workload: ExecuteTask = edge_job.command
+         process = Process(
+             target=_run_job_via_supervisor,
+             kwargs={"workload": workload},
+         )
+         process.start()
+         base_log_folder = conf.get("logging", "base_log_folder", fallback="NOT AVAILABLE")
+         if TYPE_CHECKING:
+             assert workload.log_path  # We need to assume this is defined in here
+         logfile = Path(base_log_folder, workload.log_path)
+         return process, logfile
+
+     def _launch_job_af2_10(self, edge_job: EdgeJobFetched) -> tuple[Popen, Path]:
+         """Compatibility for Airflow 2.10 Launch."""
+         env = os.environ.copy()
+         env["AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION"] = "True"
+         env["AIRFLOW__CORE__INTERNAL_API_URL"] = conf.get("edge", "api_url")
+         env["_AIRFLOW__SKIP_DATABASE_EXECUTOR_COMPATIBILITY_CHECK"] = "1"
+         command: list[str] = edge_job.command  # type: ignore[assignment]
+         process = Popen(command, close_fds=True, env=env, start_new_session=True)
+         logfile = logs_logfile_path(edge_job.key)
+         return process, logfile
+
+     def _launch_job(self, edge_job: EdgeJobFetched):
+         """Get the received job executed."""
+         process: Popen | Process
+         if AIRFLOW_V_3_0_PLUS:
+             process, logfile = self._launch_job_af3(edge_job)
+         else:
+             # Airflow 2.10
+             process, logfile = self._launch_job_af2_10(edge_job)
+         _EdgeWorkerCli.jobs.append(Job(edge_job, process, logfile, 0))
+
+     def start(self):
+         """Start the execution in a loop until terminated."""
+         try:
+             self.last_hb = worker_register(
+                 self.hostname, EdgeWorkerState.STARTING, self.queues, self._get_sysinfo()
+             ).last_update
+         except EdgeWorkerVersionException as e:
+             logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
+             raise SystemExit(str(e))
+         except HTTPError as e:
+             if e.response.status_code == HTTPStatus.NOT_FOUND:
+                 raise SystemExit("Error: API endpoint is not ready, please set [edge] api_enabled=True.")
+             raise SystemExit(str(e))
+         _write_pid_to_pidfile(self.pid_file_path)
+         signal.signal(signal.SIGINT, _EdgeWorkerCli.signal_handler)
+         signal.signal(SIG_STATUS, _EdgeWorkerCli.signal_handler)
+         signal.signal(signal.SIGTERM, self.shutdown_handler)
+         os.environ["HOSTNAME"] = self.hostname
+         os.environ["AIRFLOW__CORE__HOSTNAME_CALLABLE"] = f"{_edge_hostname.__module__}._edge_hostname"
+         try:
+             self.worker_state_changed = self.heartbeat()
+             self.last_hb = datetime.now()
+             while not _EdgeWorkerCli.drain or _EdgeWorkerCli.jobs:
+                 self.loop()
+
+             logger.info("Quitting worker, signal being offline.")
+             try:
+                 worker_set_state(
+                     self.hostname,
+                     EdgeWorkerState.OFFLINE_MAINTENANCE
+                     if _EdgeWorkerCli.maintenance_mode
+                     else EdgeWorkerState.OFFLINE,
+                     0,
+                     self.queues,
+                     self._get_sysinfo(),
+                 )
+             except EdgeWorkerVersionException:
+                 logger.info("Version mismatch of Edge worker and Core. Quitting worker anyway.")
+         finally:
+             remove_existing_pidfile(self.pid_file_path)
+
+     def loop(self):
+         """Run a loop of scheduling and monitoring tasks."""
+         new_job = False
+         previous_jobs = _EdgeWorkerCli.jobs
+         if not any((_EdgeWorkerCli.drain, _EdgeWorkerCli.maintenance_mode)) and self.free_concurrency > 0:
+             new_job = self.fetch_job()
+         self.check_running_jobs()
+
+         if (
+             _EdgeWorkerCli.drain
+             or datetime.now().timestamp() - self.last_hb.timestamp() > self.hb_interval
+             or self.worker_state_changed  # send heartbeat immediately if the state is different in db
+             or bool(previous_jobs) != bool(_EdgeWorkerCli.jobs)  # when number of jobs changes from/to 0
+         ):
+             self.worker_state_changed = self.heartbeat()
+             self.last_hb = datetime.now()
+
+         if not new_job:
+             self.interruptible_sleep()
+
+     def fetch_job(self) -> bool:
+         """Fetch and start a new job from central site."""
+         logger.debug("Attempting to fetch a new job...")
+         edge_job = jobs_fetch(self.hostname, self.queues, self.free_concurrency)
+         if edge_job:
+             logger.info("Received job: %s", edge_job)
+             self._launch_job(edge_job)
+             jobs_set_state(edge_job.key, TaskInstanceState.RUNNING)
+             return True
+
+         logger.info(
+             "No new job to process%s",
+             f", {len(_EdgeWorkerCli.jobs)} still running" if _EdgeWorkerCli.jobs else "",
+         )
+         return False
+
+     def check_running_jobs(self) -> None:
+         """Check which of the running tasks/jobs are completed and report back."""
+         used_concurrency = 0
+         for i in range(len(_EdgeWorkerCli.jobs) - 1, -1, -1):
+             job = _EdgeWorkerCli.jobs[i]
+             if not job.is_running:
+                 _EdgeWorkerCli.jobs.remove(job)
+                 if job.is_success:
+                     logger.info("Job completed: %s", job.edge_job)
+                     jobs_set_state(job.edge_job.key, TaskInstanceState.SUCCESS)
+                 else:
+                     logger.error("Job failed: %s", job.edge_job)
+                     jobs_set_state(job.edge_job.key, TaskInstanceState.FAILED)
+             else:
+                 used_concurrency += job.edge_job.concurrency_slots
+
+             if job.logfile.exists() and job.logfile.stat().st_size > job.logsize:
+                 with job.logfile.open("rb") as logfile:
+                     push_log_chunk_size = conf.getint("edge", "push_log_chunk_size")
+                     logfile.seek(job.logsize, os.SEEK_SET)
+                     read_data = logfile.read()
+                     job.logsize += len(read_data)
+                     # backslashreplace to keep not decoded characters and not raising exception
+                     # replace null with question mark to fix issue during DB push
+                     log_data = read_data.decode(errors="backslashreplace").replace("\x00", "\ufffd")
+                     while True:
+                         chunk_data = log_data[:push_log_chunk_size]
+                         log_data = log_data[push_log_chunk_size:]
+                         if not chunk_data:
+                             break
+
+                         logs_push(
+                             task=job.edge_job.key,
+                             log_chunk_time=timezone.utcnow(),
+                             log_chunk_data=chunk_data,
+                         )
+
+         self.free_concurrency = self.concurrency - used_concurrency
+
+     def heartbeat(self, new_maintenance_comments: str | None = None) -> bool:
+         """Report liveness state of worker to central site with stats."""
+         state = _EdgeWorkerCli._get_state()
+         sysinfo = self._get_sysinfo()
+         worker_state_changed: bool = False
+         try:
+             worker_info = worker_set_state(
+                 self.hostname,
+                 state,
+                 len(_EdgeWorkerCli.jobs),
+                 self.queues,
+                 sysinfo,
+                 new_maintenance_comments,
+             )
+             self.queues = worker_info.queues
+             if worker_info.state == EdgeWorkerState.MAINTENANCE_REQUEST:
+                 logger.info("Maintenance mode requested!")
+                 _EdgeWorkerCli.maintenance_mode = True
+             elif (
+                 worker_info.state in [EdgeWorkerState.IDLE, EdgeWorkerState.RUNNING]
+                 and _EdgeWorkerCli.maintenance_mode
+             ):
+                 logger.info("Exit Maintenance mode requested!")
+                 _EdgeWorkerCli.maintenance_mode = False
+             if _EdgeWorkerCli.maintenance_mode:
+                 _EdgeWorkerCli.maintenance_comments = worker_info.maintenance_comments
+             else:
+                 _EdgeWorkerCli.maintenance_comments = None
+
+             worker_state_changed = worker_info.state != state
+         except EdgeWorkerVersionException:
+             logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
+             _EdgeWorkerCli.drain = True
+         return worker_state_changed
+
+     def interruptible_sleep(self):
+         """Sleeps but stops sleeping if drain is made."""
+         drain_before_sleep = _EdgeWorkerCli.drain
+         for _ in range(0, self.job_poll_interval * 10):
+             sleep(0.1)
+             if drain_before_sleep != _EdgeWorkerCli.drain:
+                 return
+
+
+ @cli_utils.action_cli(check_db=False)
+ @providers_configuration_loaded
+ def worker(args):
+     """Start Airflow Edge Worker."""
+     print(settings.HEADER)
+     print(EDGE_WORKER_HEADER)
+
+     edge_worker = _EdgeWorkerCli(
+         pid_file_path=_pid_file_path(args.pid),
+         hostname=args.edge_hostname or getfqdn(),
+         queues=args.queues.split(",") if args.queues else None,
+         concurrency=args.concurrency,
+         job_poll_interval=conf.getint("edge", "job_poll_interval"),
+         heartbeat_interval=conf.getint("edge", "heartbeat_interval"),
+     )
+     edge_worker.start()
+
+
+ @cli_utils.action_cli(check_db=False)
+ @providers_configuration_loaded
+ def status(args):
+     """Check for Airflow Edge Worker status."""
+     pid = _get_pid(args.pid)
+
+     # Send Signal as notification to drop status JSON
+     logger.debug("Sending SIGUSR2 to worker pid %i.", pid)
+     status_min_date = time() - 1
+     status_path = Path(_status_file_path(args.pid))
+     worker_process = psutil.Process(pid)
+     worker_process.send_signal(SIG_STATUS)
+     while psutil.pid_exists(pid) and (
+         not status_path.exists() or status_path.stat().st_mtime < status_min_date
+     ):
+         sleep(0.1)
+     if not psutil.pid_exists(pid):
+         logger.warning("PID of worker dis-appeared while checking for status.")
+         sys.exit(2)
+     if not status_path.exists() or status_path.stat().st_mtime < status_min_date:
+         logger.warning("Could not read status of worker.")
+         sys.exit(3)
+     status = WorkerStatus.from_json(status_path.read_text())
+     print(json.dumps(asdict(status), indent=4))
+
+
+ @cli_utils.action_cli(check_db=False)
+ @providers_configuration_loaded
+ def maintenance(args):
+     """Set or Unset maintenance mode of worker."""
+     if args.maintenance == "on" and not args.comments:
+         logger.error("Comments are required when setting maintenance mode.")
+         sys.exit(4)
+
+     pid = _get_pid(args.pid)
+
+     # Write marker JSON file
+     from getpass import getuser
+
+     marker_path = Path(_maintenance_marker_file_path(args.pid))
+     logger.debug("Writing maintenance marker file to %s.", marker_path)
+     marker_path.write_text(
+         MaintenanceMarker(
+             maintenance=args.maintenance,
+             comments=f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}] - {getuser()} put "
+             f"node into maintenance mode via cli\nComment: {args.comments}"
+             if args.maintenance == "on"
+             else None,
+         ).json
+     )
+
+     # Send Signal as notification to fetch maintenance marker
+     logger.debug("Sending SIGUSR2 to worker pid %i.", pid)
+     status_min_date = time() - 1
+     status_path = Path(_status_file_path(args.pid))
+     worker_process = psutil.Process(pid)
+     worker_process.send_signal(SIG_STATUS)
+     while psutil.pid_exists(pid) and (
+         not status_path.exists() or status_path.stat().st_mtime < status_min_date
+     ):
+         sleep(0.1)
+     if not psutil.pid_exists(pid):
+         logger.warning("PID of worker dis-appeared while checking for status.")
+         sys.exit(2)
+     if not status_path.exists() or status_path.stat().st_mtime < status_min_date:
+         logger.warning("Could not read status of worker.")
+         sys.exit(3)
+     status = WorkerStatus.from_json(status_path.read_text())
+
+     if args.wait:
+         if args.maintenance == "on" and status.state != EdgeWorkerState.MAINTENANCE_MODE:
+             logger.info("Waiting for worker to be drained...")
+             while True:
+                 sleep(4.5)
+                 worker_process.send_signal(SIG_STATUS)
+                 sleep(0.5)
+                 status = WorkerStatus.from_json(status_path.read_text())
+                 if status.state == EdgeWorkerState.MAINTENANCE_MODE:
+                     logger.info("Worker was drained successfully!")
+                     break
+                 if status.state not in [
+                     EdgeWorkerState.MAINTENANCE_REQUEST,
+                     EdgeWorkerState.MAINTENANCE_PENDING,
+                 ]:
+                     logger.info("Worker maintenance was exited by someone else!")
+                     break
+         if args.maintenance == "off" and status.state == EdgeWorkerState.MAINTENANCE_MODE:
+             logger.info("Waiting for worker to exit maintenance...")
+             while status.state in [EdgeWorkerState.MAINTENANCE_MODE, EdgeWorkerState.MAINTENANCE_EXIT]:
+                 sleep(4.5)
+                 worker_process.send_signal(SIG_STATUS)
+                 sleep(0.5)
+                 status = WorkerStatus.from_json(status_path.read_text())
+
+     print(json.dumps(asdict(status), indent=4))
+
+
+ @cli_utils.action_cli(check_db=False)
+ @providers_configuration_loaded
+ def stop(args):
+     """Stop a running Airflow Edge Worker."""
+     pid = _get_pid(args.pid)
+     # Send SIGINT
+     logger.info("Sending SIGINT to worker pid %i.", pid)
+     worker_process = psutil.Process(pid)
+     worker_process.send_signal(signal.SIGINT)
+
+     if args.wait:
+         logger.info("Waiting for worker to stop...")
+         while psutil.pid_exists(pid):
+             sleep(0.1)
+         logger.info("Worker has been shut down.")
+
+
+ ARG_CONCURRENCY = Arg(
+     ("-c", "--concurrency"),
+     type=int,
+     help="The number of worker processes",
+     default=conf.getint("edge", "worker_concurrency", fallback=8),
+ )
+ ARG_QUEUES = Arg(
+     ("-q", "--queues"),
+     help="Comma delimited list of queues to serve, serve all queues if not provided.",
+ )
+ ARG_EDGE_HOSTNAME = Arg(
+     ("-H", "--edge-hostname"),
+     help="Set the hostname of worker if you have multiple workers on a single machine",
+ )
+ ARG_MAINTENANCE = Arg(("maintenance",), help="Desired maintenance state", choices=("on", "off"))
+ ARG_MAINTENANCE_COMMENT = Arg(
+     ("-c", "--comments"),
+     help="Maintenance comments to report reason. Required if maintenance is turned on.",
+ )
+ ARG_WAIT_MAINT = Arg(
+     ("-w", "--wait"),
+     default=False,
+     help="Wait until edge worker has reached desired state.",
+     action="store_true",
+ )
+ ARG_WAIT_STOP = Arg(
+     ("-w", "--wait"),
+     default=False,
+     help="Wait until edge worker is shut down.",
+     action="store_true",
+ )
+ EDGE_COMMANDS: list[ActionCommand] = [
+     ActionCommand(
+         name=worker.__name__,
+         help=worker.__doc__,
+         func=worker,
+         args=(
+             ARG_CONCURRENCY,
+             ARG_QUEUES,
+             ARG_EDGE_HOSTNAME,
+             ARG_PID,
+             ARG_VERBOSE,
+         ),
+     ),
+     ActionCommand(
+         name=status.__name__,
+         help=status.__doc__,
+         func=status,
+         args=(
+             ARG_PID,
+             ARG_VERBOSE,
+         ),
+     ),
+     ActionCommand(
+         name=maintenance.__name__,
+         help=maintenance.__doc__,
+         func=maintenance,
+         args=(
+             ARG_MAINTENANCE,
+             ARG_MAINTENANCE_COMMENT,
+             ARG_WAIT_MAINT,
+             ARG_PID,
+             ARG_VERBOSE,
+         ),
+     ),
+     ActionCommand(
+         name=stop.__name__,
+         help=stop.__doc__,
+         func=stop,
+         args=(
+             ARG_WAIT_STOP,
+             ARG_PID,
+             ARG_VERBOSE,
+         ),
+     ),
+ ]
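
Editor's note (not part of the packaged file): the status and maintenance commands above talk to a running worker purely through signals and small JSON files. They send SIG_STATUS (SIGUSR2 on POSIX, SIGBREAK on Windows) and wait for the worker's signal_handler to rewrite the status file next to its PID file. The sketch below illustrates that handshake with plain standard-library calls; the file paths are hypothetical placeholders (the real CLI resolves them via cli_utils.setup_locations), and it assumes the status JSON carries the WorkerStatus field names used in signal_handler above.

    # Illustrative sketch only: query a running edge worker the way "airflow edge status" does.
    # PID_FILE and STATUS_FILE are assumed locations, not the provider's real defaults.
    import json
    import os
    import signal
    import time
    from pathlib import Path

    PID_FILE = Path("/tmp/edge-worker.pid")        # hypothetical path for this sketch
    STATUS_FILE = Path("/tmp/edge-worker.status")  # hypothetical path for this sketch


    def query_worker_status(timeout: float = 5.0) -> dict:
        """Ask a running edge worker to dump its status file and return the parsed JSON."""
        pid = int(PID_FILE.read_text().strip())
        asked_at = time.time()
        os.kill(pid, signal.SIGUSR2)  # the worker's signal_handler() rewrites the status file
        while time.time() - asked_at < timeout:
            if STATUS_FILE.exists() and STATUS_FILE.stat().st_mtime >= asked_at - 1:
                return json.loads(STATUS_FILE.read_text())
            time.sleep(0.1)
        raise TimeoutError("worker did not refresh its status file in time")


    if __name__ == "__main__":
        info = query_worker_status()
        print(info["state"], "drain:", info["drain"], "jobs:", info["job_count"])

The packaged CLI entry points (airflow edge status, airflow edge maintenance, airflow edge stop) wrap this same pattern, with the maintenance command additionally writing a MaintenanceMarker JSON to the ".in" marker file before sending the signal.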