apache-airflow-providers-edge3 2.0.0rc1__py3-none-any.whl → 3.0.1rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. The information is provided for informational purposes only.
Files changed (31)
  1. airflow/providers/edge3/__init__.py +1 -1
  2. airflow/providers/edge3/cli/api_client.py +30 -28
  3. airflow/providers/edge3/cli/dataclasses.py +3 -10
  4. airflow/providers/edge3/cli/definition.py +261 -0
  5. airflow/providers/edge3/cli/edge_command.py +8 -206
  6. airflow/providers/edge3/cli/worker.py +226 -198
  7. airflow/providers/edge3/example_dags/win_notepad.py +1 -1
  8. airflow/providers/edge3/executors/edge_executor.py +24 -49
  9. airflow/providers/edge3/get_provider_info.py +1 -0
  10. airflow/providers/edge3/models/edge_job.py +1 -2
  11. airflow/providers/edge3/models/edge_worker.py +61 -16
  12. airflow/providers/edge3/plugins/edge_executor_plugin.py +1 -1
  13. airflow/providers/edge3/plugins/www/dist/main.umd.cjs +8 -8
  14. airflow/providers/edge3/plugins/www/package.json +32 -27
  15. airflow/providers/edge3/plugins/www/pnpm-lock.yaml +1625 -1716
  16. airflow/providers/edge3/plugins/www/src/global.d.ts +24 -0
  17. airflow/providers/edge3/plugins/www/src/layouts/NavTabs.tsx +25 -3
  18. airflow/providers/edge3/plugins/www/src/main.tsx +6 -1
  19. airflow/providers/edge3/plugins/www/src/theme.ts +1 -1
  20. airflow/providers/edge3/worker_api/datamodels.py +12 -1
  21. airflow/providers/edge3/worker_api/routes/jobs.py +21 -8
  22. airflow/providers/edge3/worker_api/routes/logs.py +1 -1
  23. airflow/providers/edge3/worker_api/routes/worker.py +16 -3
  24. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/METADATA +14 -10
  25. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/RECORD +29 -29
  26. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/licenses/NOTICE +1 -1
  27. airflow/providers/edge3/plugins/templates/edge_worker_hosts.html +0 -175
  28. airflow/providers/edge3/plugins/templates/edge_worker_jobs.html +0 -69
  29. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/WHEEL +0 -0
  30. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/entry_points.txt +0 -0
  31. {apache_airflow_providers_edge3-2.0.0rc1.dist-info → apache_airflow_providers_edge3-3.0.1rc1.dist-info}/licenses/LICENSE +0 -0
--- a/airflow/providers/edge3/cli/worker.py
+++ b/airflow/providers/edge3/cli/worker.py
@@ -20,16 +20,18 @@ import logging
 import os
 import signal
 import sys
+import traceback
+from asyncio import Task, create_task, get_running_loop, sleep
 from datetime import datetime
 from functools import cache
 from http import HTTPStatus
-from multiprocessing import Process
+from multiprocessing import Process, Queue
 from pathlib import Path
-from time import sleep
 from typing import TYPE_CHECKING
 
+from aiofiles import open as aio_open
+from aiohttp import ClientResponseError
 from lockfile.pidlockfile import remove_existing_pidfile
-from requests import HTTPError
 
 from airflow import __version__ as airflow_version
 from airflow.configuration import conf
@@ -58,9 +60,12 @@ from airflow.utils.net import getfqdn
 from airflow.utils.state import TaskInstanceState
 
 if TYPE_CHECKING:
-    from airflow.providers.edge3.worker_api.datamodels import EdgeJobFetched
+    from airflow.executors.workloads import ExecuteTask
 
 logger = logging.getLogger(__name__)
+base_log_folder = conf.get("logging", "base_log_folder", fallback="NOT AVAILABLE")
+push_logs = conf.getboolean("edge", "push_logs")
+push_log_chunk_size = conf.getint("edge", "push_log_chunk_size")
 
 if sys.platform == "darwin":
     setproctitle = lambda title: logger.debug("Mac OS detected, skipping setproctitle")
@@ -73,21 +78,30 @@ def _edge_hostname() -> str:
     return os.environ.get("HOSTNAME", getfqdn())
 
 
+@cache
+def _execution_api_server_url() -> str:
+    """Get the execution api server url from config or environment."""
+    api_url = conf.get("edge", "api_url")
+    execution_api_server_url = conf.get("core", "execution_api_server_url", fallback="")
+    if not execution_api_server_url and api_url:
+        # Derive execution api url from edge api url as fallback
+        execution_api_server_url = api_url.replace("edge_worker/v1/rpcapi", "execution")
+    logger.info("Using execution api server url: %s", execution_api_server_url)
+    return execution_api_server_url
+
+
 class EdgeWorker:
     """Runner instance which executes the Edge Worker."""
 
     jobs: list[Job] = []
     """List of jobs that the worker is running currently."""
-    last_hb: datetime | None = None
-    """Timestamp of last heart beat sent to server."""
     drain: bool = False
     """Flag if job processing should be completed and no new jobs fetched for a graceful stop/shutdown."""
     maintenance_mode: bool = False
    """Flag if job processing should be completed and no new jobs fetched for maintenance mode. """
     maintenance_comments: str | None = None
     """Comments for maintenance mode."""
-    edge_instance: EdgeWorker | None = None
-    """Singleton instance of the worker."""
+    background_tasks: set[Task] = set()
 
     def __init__(
         self,
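Note: the relocated `_execution_api_server_url()` helper above is wrapped in `@cache`, so the fallback derivation runs once per process. The derivation itself is a plain string replacement on the configured Edge API URL; a minimal sketch of what it produces, using a hypothetical deployment URL for illustration:

    # Hypothetical URL, for illustration only
    api_url = "https://airflow.example.com/edge_worker/v1/rpcapi"
    execution_api_server_url = api_url.replace("edge_worker/v1/rpcapi", "execution")
    assert execution_api_server_url == "https://airflow.example.com/execution"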
@@ -105,48 +119,54 @@ class EdgeWorker:
         self.hostname = hostname
         self.queues = queues
         self.concurrency = concurrency
-        self.free_concurrency = concurrency
         self.daemon = daemon
 
-        EdgeWorker.edge_instance = self
-
-    @staticmethod
-    def signal_handler(sig: signal.Signals, frame):
-        if sig == SIG_STATUS:
-            marker_path = Path(maintenance_marker_file_path(None))
-            if marker_path.exists():
-                request = MaintenanceMarker.from_json(marker_path.read_text())
-                logger.info("Requested to set maintenance mode to %s", request.maintenance)
-                EdgeWorker.maintenance_mode = request.maintenance == "on"
-                if EdgeWorker.maintenance_mode and request.comments:
-                    logger.info("Comments: %s", request.comments)
-                    EdgeWorker.maintenance_comments = request.comments
-                marker_path.unlink()
-                # send heartbeat immediately to update state
-                if EdgeWorker.edge_instance:
-                    EdgeWorker.edge_instance.heartbeat(EdgeWorker.maintenance_comments)
-            else:
-                logger.info("Request to get status of Edge Worker received.")
-                status_path = Path(status_file_path(None))
-                status_path.write_text(
-                    WorkerStatus(
-                        job_count=len(EdgeWorker.jobs),
-                        jobs=[job.edge_job.key for job in EdgeWorker.jobs],
-                        state=EdgeWorker._get_state(),
-                        maintenance=EdgeWorker.maintenance_mode,
-                        maintenance_comments=EdgeWorker.maintenance_comments,
-                        drain=EdgeWorker.drain,
-                    ).json
-                )
+    @property
+    def free_concurrency(self) -> int:
+        """Calculate the free concurrency of the worker."""
+        used_concurrency = sum(job.edge_job.concurrency_slots for job in self.jobs)
+        return self.concurrency - used_concurrency
+
+    def signal_status(self):
+        marker_path = Path(maintenance_marker_file_path(None))
+        if marker_path.exists():
+            request = MaintenanceMarker.from_json(marker_path.read_text())
+            logger.info("Requested to set maintenance mode to %s", request.maintenance)
+            self.maintenance_mode = request.maintenance == "on"
+            if self.maintenance_mode and request.comments:
+                logger.info("Comments: %s", request.comments)
+                self.maintenance_comments = request.comments
+            marker_path.unlink()
+            # send heartbeat immediately to update state
+            task = get_running_loop().create_task(self.heartbeat(self.maintenance_comments))
+            self.background_tasks.add(task)
+            task.add_done_callback(self.background_tasks.discard)
         else:
-            logger.info("Request to shut down Edge Worker received, waiting for jobs to complete.")
-            EdgeWorker.drain = True
+            logger.info("Request to get status of Edge Worker received.")
+            status_path = Path(status_file_path(None))
+            status_path.write_text(
+                WorkerStatus(
+                    job_count=len(self.jobs),
+                    jobs=[job.edge_job.key for job in self.jobs],
+                    state=self._get_state(),
+                    maintenance=self.maintenance_mode,
+                    maintenance_comments=self.maintenance_comments,
+                    drain=self.drain,
+                ).json
+            )
+
+    def signal_drain(self):
+        self.drain = True
+        logger.info("Request to shut down Edge Worker received, waiting for jobs to complete.")
 
-    def shutdown_handler(self, sig, frame):
-        logger.info("SIGTERM received. Terminating all jobs and quit")
-        for job in EdgeWorker.jobs:
-            os.killpg(job.process.pid, signal.SIGTERM)
-        EdgeWorker.drain = True
+    def shutdown_handler(self):
+        self.drain = True
+        msg = "SIGTERM received. Sending SIGTERM to all jobs and quit"
+        logger.info(msg)
+        for job in self.jobs:
+            if job.process.pid:
+                os.setpgid(job.process.pid, 0)
+                os.kill(job.process.pid, signal.SIGTERM)
 
     def _get_sysinfo(self) -> dict:
         """Produce the sysinfo from worker to post to central site."""
@@ -157,116 +177,121 @@ class EdgeWorker:
             "free_concurrency": self.free_concurrency,
         }
 
-    @staticmethod
-    def _get_state() -> EdgeWorkerState:
+    def _get_state(self) -> EdgeWorkerState:
         """State of the Edge Worker."""
-        if EdgeWorker.jobs:
-            if EdgeWorker.drain:
+        if self.jobs:
+            if self.drain:
                 return EdgeWorkerState.TERMINATING
-            if EdgeWorker.maintenance_mode:
+            if self.maintenance_mode:
                 return EdgeWorkerState.MAINTENANCE_PENDING
             return EdgeWorkerState.RUNNING
 
-        if EdgeWorker.drain:
-            if EdgeWorker.maintenance_mode:
+        if self.drain:
+            if self.maintenance_mode:
                 return EdgeWorkerState.OFFLINE_MAINTENANCE
             return EdgeWorkerState.OFFLINE
 
-        if EdgeWorker.maintenance_mode:
+        if self.maintenance_mode:
             return EdgeWorkerState.MAINTENANCE_MODE
         return EdgeWorkerState.IDLE
 
-    @staticmethod
-    @cache
-    def _execution_api_server_url() -> str:
-        """Get the execution api server url from config or environment."""
-        api_url = conf.get("edge", "api_url")
-        execution_api_server_url = conf.get("core", "execution_api_server_url", fallback="")
-        if not execution_api_server_url and api_url:
-            # Derive execution api url from edge api url as fallback
-            execution_api_server_url = api_url.replace("edge_worker/v1/rpcapi", "execution")
-        logger.info("Using execution api server url: %s", execution_api_server_url)
-        return execution_api_server_url
-
-    @staticmethod
-    def _run_job_via_supervisor(workload, execution_api_server_url) -> int:
+    def _run_job_via_supervisor(self, workload: ExecuteTask, results_queue: Queue) -> int:
         from airflow.sdk.execution_time.supervisor import supervise
 
         # Ignore ctrl-c in this process -- we don't want to kill _this_ one. we let tasks run to completion
-        signal.signal(signal.SIGINT, signal.SIG_IGN)
+        os.setpgrp()
 
         logger.info("Worker starting up pid=%d", os.getpid())
-        setproctitle(f"airflow edge worker: {workload.ti.key}")
+        ti = workload.ti
+        setproctitle(
+            "airflow edge supervisor: "
+            f"dag_id={ti.dag_id} task_id={ti.task_id} run_id={ti.run_id} map_index={ti.map_index} "
+            f"try_number={ti.try_number}"
+        )
 
         try:
             supervise(
                 # This is the "wrong" ti type, but it duck types the same. TODO: Create a protocol for this.
                 # Same like in airflow/executors/local_executor.py:_execute_work()
-                ti=workload.ti,  # type: ignore[arg-type]
+                ti=ti,  # type: ignore[arg-type]
                 dag_rel_path=workload.dag_rel_path,
                 bundle_info=workload.bundle_info,
                 token=workload.token,
-                server=execution_api_server_url,
+                server=_execution_api_server_url(),
                 log_path=workload.log_path,
             )
             return 0
         except Exception as e:
-            logger.exception("Task execution failed: %s", e)
+            logger.exception("Task execution failed")
+            results_queue.put(e)
             return 1
 
-    @staticmethod
-    def _launch_job(edge_job: EdgeJobFetched):
-        if TYPE_CHECKING:
-            from airflow.executors.workloads import ExecuteTask
-
-        workload: ExecuteTask = edge_job.command
+    def _launch_job(self, workload: ExecuteTask) -> tuple[Process, Queue[Exception]]:
+        # Improvement: Use frozen GC to prevent child process from copying unnecessary memory
+        # See _spawn_workers_with_gc_freeze() in airflow-core/src/airflow/executors/local_executor.py
+        results_queue: Queue[Exception] = Queue()
         process = Process(
-            target=EdgeWorker._run_job_via_supervisor,
-            kwargs={"workload": workload, "execution_api_server_url": EdgeWorker._execution_api_server_url()},
+            target=self._run_job_via_supervisor,
+            kwargs={"workload": workload, "results_queue": results_queue},
         )
         process.start()
-        base_log_folder = conf.get("logging", "base_log_folder", fallback="NOT AVAILABLE")
-        if TYPE_CHECKING:
-            assert workload.log_path  # We need to assume this is defined in here
-        logfile = Path(base_log_folder, workload.log_path)
-        EdgeWorker.jobs.append(Job(edge_job, process, logfile, 0))
-
-    def start(self):
+        return process, results_queue
+
+    async def _push_logs_in_chunks(self, job: Job):
+        if push_logs and job.logfile.exists() and job.logfile.stat().st_size > job.logsize:
+            async with aio_open(job.logfile, mode="rb") as logf:
+                await logf.seek(job.logsize, os.SEEK_SET)
+                read_data = await logf.read()
+                job.logsize += len(read_data)
+                # backslashreplace to keep not decoded characters and not raising exception
+                # replace null with question mark to fix issue during DB push
+                log_data = read_data.decode(errors="backslashreplace").replace("\x00", "\ufffd")
+                while True:
+                    chunk_data = log_data[:push_log_chunk_size]
+                    log_data = log_data[push_log_chunk_size:]
+                    if not chunk_data:
+                        break
+
+                    await logs_push(
+                        task=job.edge_job.key,
+                        log_chunk_time=timezone.utcnow(),
+                        log_chunk_data=chunk_data,
+                    )
+
+    async def start(self):
         """Start the execution in a loop until terminated."""
         try:
-            self.last_hb = worker_register(
-                self.hostname, EdgeWorkerState.STARTING, self.queues, self._get_sysinfo()
-            ).last_update
+            await worker_register(self.hostname, EdgeWorkerState.STARTING, self.queues, self._get_sysinfo())
         except EdgeWorkerVersionException as e:
             logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
             raise SystemExit(str(e))
         except EdgeWorkerDuplicateException as e:
             logger.error(str(e))
             raise SystemExit(str(e))
-        except HTTPError as e:
-            if e.response.status_code == HTTPStatus.NOT_FOUND:
-                raise SystemExit("Error: API endpoint is not ready, please set [edge] api_enabled=True.")
+        except ClientResponseError as e:
+            # Note: Method not allowed is raised by FastAPI if the API is not enabled (not 404)
+            if e.status in {HTTPStatus.NOT_FOUND, HTTPStatus.METHOD_NOT_ALLOWED}:
+                raise SystemExit(
+                    "Error: API endpoint is not ready, please set [edge] api_enabled=True. Or check if the URL is correct to your deployment."
+                )
             raise SystemExit(str(e))
         if not self.daemon:
             write_pid_to_pidfile(self.pid_file_path)
-        signal.signal(signal.SIGINT, EdgeWorker.signal_handler)
-        signal.signal(SIG_STATUS, EdgeWorker.signal_handler)
-        signal.signal(signal.SIGTERM, self.shutdown_handler)
+        loop = get_running_loop()
+        loop.add_signal_handler(signal.SIGINT, self.signal_drain)
+        loop.add_signal_handler(SIG_STATUS, self.signal_status)
+        loop.add_signal_handler(signal.SIGTERM, self.shutdown_handler)
+        setproctitle(f"airflow edge worker: {self.hostname}")
         os.environ["HOSTNAME"] = self.hostname
         os.environ["AIRFLOW__CORE__HOSTNAME_CALLABLE"] = f"{_edge_hostname.__module__}._edge_hostname"
         try:
-            self.worker_state_changed = self.heartbeat()
-            self.last_hb = datetime.now()
-            while not EdgeWorker.drain or EdgeWorker.jobs:
-                self.loop()
+            await self.loop()
 
             logger.info("Quitting worker, signal being offline.")
             try:
-                worker_set_state(
+                await worker_set_state(
                     self.hostname,
-                    EdgeWorkerState.OFFLINE_MAINTENANCE
-                    if EdgeWorker.maintenance_mode
-                    else EdgeWorkerState.OFFLINE,
+                    EdgeWorkerState.OFFLINE_MAINTENANCE if self.maintenance_mode else EdgeWorkerState.OFFLINE,
                     0,
                     self.queues,
                     self._get_sysinfo(),
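Note: with the worker now awaiting inside an event loop, signal handling moves from `signal.signal` to `loop.add_signal_handler`, which invokes the callback from the event loop instead of interrupting arbitrary bytecode, so handlers can safely mutate worker state and schedule coroutines. A minimal standalone sketch of that mechanism (illustrative names; Unix only, as `add_signal_handler` is not implemented on Windows):

    import asyncio
    import signal

    async def main() -> None:
        stop = asyncio.Event()
        loop = asyncio.get_running_loop()
        # Callbacks run inside the event loop, not in an async-signal context
        loop.add_signal_handler(signal.SIGINT, stop.set)
        loop.add_signal_handler(signal.SIGTERM, stop.set)
        await stop.wait()
        print("drain requested, shutting down")

    asyncio.run(main())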
@@ -277,95 +302,98 @@ class EdgeWorker:
         if not self.daemon:
             remove_existing_pidfile(self.pid_file_path)
 
-    def loop(self):
+    async def loop(self):
         """Run a loop of scheduling and monitoring tasks."""
-        new_job = False
-        previous_jobs = EdgeWorker.jobs
-        if not any((EdgeWorker.drain, EdgeWorker.maintenance_mode)) and self.free_concurrency > 0:
-            new_job = self.fetch_job()
-        self.check_running_jobs()
-
-        if (
-            EdgeWorker.drain
-            or datetime.now().timestamp() - self.last_hb.timestamp() > self.hb_interval
-            or self.worker_state_changed  # send heartbeat immediately if the state is different in db
-            or bool(previous_jobs) != bool(EdgeWorker.jobs)  # when number of jobs changes from/to 0
-        ):
-            self.worker_state_changed = self.heartbeat()
-            self.last_hb = datetime.now()
-
-        if not new_job:
-            self.interruptible_sleep()
-
-    def fetch_job(self) -> bool:
-        """Fetch and start a new job from central site."""
+        last_hb = datetime.now()
+        worker_state_changed = True  # force heartbeat at start
+        previous_jobs = 0
+        while not self.drain or self.jobs:
+            if (
+                self.drain
+                or datetime.now().timestamp() - last_hb.timestamp() > self.hb_interval
+                or worker_state_changed  # send heartbeat immediately if the state is different in db
+                or previous_jobs != len(self.jobs)  # when number of jobs changes
+            ):
+                worker_state_changed = await self.heartbeat()
+                last_hb = datetime.now()
+                previous_jobs = len(self.jobs)
+
+            if self.maintenance_mode:
+                logger.info("in maintenance mode%s", f", {len(self.jobs)} draining jobs" if self.jobs else "")
+            elif not self.drain and self.free_concurrency > 0:
+                task = create_task(self.fetch_and_run_job())
+                self.background_tasks.add(task)
+                task.add_done_callback(self.background_tasks.discard)
+            else:
+                logger.info("%i %s running", len(self.jobs), "job is" if len(self.jobs) == 1 else "jobs are")
+
+            await self.interruptible_sleep()
+
+    async def fetch_and_run_job(self) -> None:
+        """Fetch, start and monitor a new job."""
         logger.debug("Attempting to fetch a new job...")
-        edge_job = jobs_fetch(self.hostname, self.queues, self.free_concurrency)
-        if edge_job:
-            logger.info("Received job: %s", edge_job)
-            EdgeWorker._launch_job(edge_job)
-            jobs_set_state(edge_job.key, TaskInstanceState.RUNNING)
-            return True
-
-        logger.info(
-            "No new job to process%s",
-            f", {len(EdgeWorker.jobs)} still running" if EdgeWorker.jobs else "",
-        )
-        return False
-
-    def check_running_jobs(self) -> None:
-        """Check which of the running tasks/jobs are completed and report back."""
-        used_concurrency = 0
-        for i in range(len(EdgeWorker.jobs) - 1, -1, -1):
-            job = EdgeWorker.jobs[i]
-            if not job.is_running:
-                EdgeWorker.jobs.remove(job)
-                if job.is_success:
-                    logger.info("Job completed: %s", job.edge_job)
-                    jobs_set_state(job.edge_job.key, TaskInstanceState.SUCCESS)
-                else:
-                    logger.error("Job failed: %s", job.edge_job)
-                    jobs_set_state(job.edge_job.key, TaskInstanceState.FAILED)
+        edge_job = await jobs_fetch(self.hostname, self.queues, self.free_concurrency)
+        if not edge_job:
+            logger.info(
+                "No new job to process%s",
+                f", {len(self.jobs)} still running" if self.jobs else "",
+            )
+            return
+
+        logger.info("Received job: %s", edge_job.identifier)
+
+        workload: ExecuteTask = edge_job.command
+        process, results_queue = self._launch_job(workload)
+        if TYPE_CHECKING:
+            assert workload.log_path  # We need to assume this is defined in here
+        logfile = Path(base_log_folder, workload.log_path)
+        job = Job(edge_job, process, logfile)
+        self.jobs.append(job)
+        await jobs_set_state(edge_job.key, TaskInstanceState.RUNNING)
+
+        # As we got one job, directly fetch another one if possible
+        if self.free_concurrency > 0:
+            task = create_task(self.fetch_and_run_job())
+            self.background_tasks.add(task)
+            task.add_done_callback(self.background_tasks.discard)
+
+        while job.is_running:
+            await self._push_logs_in_chunks(job)
+            for _ in range(0, self.job_poll_interval * 10):
+                await sleep(0.1)
+                if not job.is_running:
+                    break
+        await self._push_logs_in_chunks(job)
+
+        self.jobs.remove(job)
+        if job.is_success:
+            logger.info("Job completed: %s", job.edge_job.identifier)
+            await jobs_set_state(job.edge_job.key, TaskInstanceState.SUCCESS)
+        else:
+            if results_queue.empty():
+                ex_txt = "(Unknown error, no exception details available)"
             else:
-                used_concurrency += job.edge_job.concurrency_slots
+                ex = results_queue.get()
+                ex_txt = "\n".join(traceback.format_exception(ex))
+            logger.error("Job failed: %s with:\n%s", job.edge_job.identifier, ex_txt)
+            # Push it upwards to logs for better diagnostic as well
+            await logs_push(
+                task=job.edge_job.key,
+                log_chunk_time=timezone.utcnow(),
+                log_chunk_data=f"Error starting job:\n{ex_txt}",
+            )
+            await jobs_set_state(job.edge_job.key, TaskInstanceState.FAILED)
 
-            if (
-                conf.getboolean("edge", "push_logs")
-                and job.logfile.exists()
-                and job.logfile.stat().st_size > job.logsize
-            ):
-                with job.logfile.open("rb") as logfile:
-                    push_log_chunk_size = conf.getint("edge", "push_log_chunk_size")
-                    logfile.seek(job.logsize, os.SEEK_SET)
-                    read_data = logfile.read()
-                    job.logsize += len(read_data)
-                    # backslashreplace to keep not decoded characters and not raising exception
-                    # replace null with question mark to fix issue during DB push
-                    log_data = read_data.decode(errors="backslashreplace").replace("\x00", "\ufffd")
-                    while True:
-                        chunk_data = log_data[:push_log_chunk_size]
-                        log_data = log_data[push_log_chunk_size:]
-                        if not chunk_data:
-                            break
-
-                        logs_push(
-                            task=job.edge_job.key,
-                            log_chunk_time=timezone.utcnow(),
-                            log_chunk_data=chunk_data,
-                        )
-
-        self.free_concurrency = self.concurrency - used_concurrency
-
-    def heartbeat(self, new_maintenance_comments: str | None = None) -> bool:
+    async def heartbeat(self, new_maintenance_comments: str | None = None) -> bool:
         """Report liveness state of worker to central site with stats."""
-        state = EdgeWorker._get_state()
+        state = self._get_state()
         sysinfo = self._get_sysinfo()
         worker_state_changed: bool = False
         try:
-            worker_info = worker_set_state(
+            worker_info = await worker_set_state(
                 self.hostname,
                 state,
-                len(EdgeWorker.jobs),
+                len(self.jobs),
                 self.queues,
                 sysinfo,
                 new_maintenance_comments,
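Note: the failure branch above recovers the supervisor child's exception through the `multiprocessing.Queue` wired in by `_launch_job()`: the child `put()`s the exception before returning 1, and the parent renders it with `traceback.format_exception`. A reduced sketch of that round trip (hypothetical child function; the single-argument form of `format_exception` needs Python 3.10+, and a pickled exception arrives without its original traceback):

    import traceback
    from multiprocessing import Process, Queue

    def child(results_queue: Queue) -> None:
        try:
            raise ValueError("boom")  # stand-in for supervise(...)
        except Exception as e:
            results_queue.put(e)  # exceptions pickle across the process boundary

    if __name__ == "__main__":
        results_queue: Queue = Queue()
        p = Process(target=child, args=(results_queue,))
        p.start()
        p.join()
        if not results_queue.empty():
            print("\n".join(traceback.format_exception(results_queue.get())))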
@@ -373,31 +401,31 @@ class EdgeWorker:
             self.queues = worker_info.queues
             if worker_info.state == EdgeWorkerState.MAINTENANCE_REQUEST:
                 logger.info("Maintenance mode requested!")
-                EdgeWorker.maintenance_mode = True
+                self.maintenance_mode = True
             elif (
-                worker_info.state in [EdgeWorkerState.IDLE, EdgeWorkerState.RUNNING]
-                and EdgeWorker.maintenance_mode
+                worker_info.state in [EdgeWorkerState.IDLE, EdgeWorkerState.RUNNING] and self.maintenance_mode
             ):
                 logger.info("Exit Maintenance mode requested!")
-                EdgeWorker.maintenance_mode = False
-            if EdgeWorker.maintenance_mode:
-                EdgeWorker.maintenance_comments = worker_info.maintenance_comments
+                self.maintenance_mode = False
+            if self.maintenance_mode:
+                self.maintenance_comments = worker_info.maintenance_comments
             else:
-                EdgeWorker.maintenance_comments = None
+                self.maintenance_comments = None
             if worker_info.state == EdgeWorkerState.SHUTDOWN_REQUEST:
                 logger.info("Shutdown requested!")
-                EdgeWorker.drain = True
+                self.drain = True
 
             worker_state_changed = worker_info.state != state
         except EdgeWorkerVersionException:
             logger.info("Version mismatch of Edge worker and Core. Shutting down worker.")
-            EdgeWorker.drain = True
+            self.drain = True
         return worker_state_changed
 
-    def interruptible_sleep(self):
-        """Sleeps but stops sleeping if drain is made."""
-        drain_before_sleep = EdgeWorker.drain
+    async def interruptible_sleep(self):
+        """Sleeps but stops sleeping if drain is made or some job completed."""
+        drain_before_sleep = self.drain
+        jobcount_before_sleep = len(self.jobs)
         for _ in range(0, self.job_poll_interval * 10):
-            sleep(0.1)
-            if drain_before_sleep != EdgeWorker.drain:
+            await sleep(0.1)
+            if drain_before_sleep != self.drain or len(self.jobs) < jobcount_before_sleep:
                 return
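Note: `interruptible_sleep()` keeps the poll interval responsive by sleeping in 0.1 s slices and returning early when `drain` flips or a job finishes; the same slicing guards the per-job monitor loop in `fetch_and_run_job()`. The idea in isolation (illustrative helper, not the provider's API):

    import asyncio
    from typing import Callable

    async def interruptible_sleep(seconds: int, should_wake: Callable[[], bool]) -> None:
        # Sleep in 0.1s slices so a state change cuts the wait short
        for _ in range(seconds * 10):
            await asyncio.sleep(0.1)
            if should_wake():
                return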
--- a/airflow/providers/edge3/example_dags/win_notepad.py
+++ b/airflow/providers/edge3/example_dags/win_notepad.py
@@ -37,7 +37,7 @@ from airflow.models.dag import DAG
 from airflow.sdk import Param
 
 if TYPE_CHECKING:
-    from airflow.utils.context import Context
+    from airflow.sdk import Context
 
 
 class NotepadOperator(BaseOperator):