indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -311
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -267
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -314
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.30.dist-info/RECORD +0 -68
  73. indexify-0.3.30.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,759 @@
1
+ import asyncio
2
+ import time
3
+ from collections.abc import Coroutine
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from indexify.executor.blob_store.blob_store import BLOBStore
8
+ from indexify.executor.function_executor.function_executor import FunctionExecutor
9
+ from indexify.executor.function_executor.health_checker import HealthCheckResult
10
+ from indexify.executor.function_executor.server.function_executor_server_factory import (
11
+ FunctionExecutorServerFactory,
12
+ )
13
+ from indexify.executor.state_reporter import ExecutorStateReporter
14
+ from indexify.proto.executor_api_pb2 import (
15
+ FunctionExecutorDescription,
16
+ FunctionExecutorState,
17
+ FunctionExecutorStatus,
18
+ FunctionExecutorTerminationReason,
19
+ Task,
20
+ )
21
+
22
+ from .completed_task_metrics import emit_completed_task_metrics
23
+ from .create_function_executor import create_function_executor
24
+ from .debug_event_loop import (
25
+ debug_print_adding_event,
26
+ debug_print_events,
27
+ debug_print_processing_event,
28
+ )
29
+ from .destroy_function_executor import destroy_function_executor
30
+ from .events import (
31
+ BaseEvent,
32
+ EventType,
33
+ FunctionExecutorCreated,
34
+ FunctionExecutorDestroyed,
35
+ ScheduleTaskExecution,
36
+ ShutdownInitiated,
37
+ TaskExecutionFinished,
38
+ TaskOutputUploadFinished,
39
+ TaskPreparationFinished,
40
+ )
41
+ from .loggers import function_executor_logger, task_logger
42
+ from .metrics.function_executor_controller import (
43
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
44
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
45
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
46
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
47
+ metric_control_loop_handle_event_latency,
48
+ metric_function_executors_with_status,
49
+ metric_runnable_tasks,
50
+ metric_runnable_tasks_per_function_name,
51
+ metric_schedule_task_latency,
52
+ metric_tasks_fetched,
53
+ )
54
+ from .prepare_task import prepare_task
55
+ from .run_task import run_task_on_function_executor
56
+ from .task_info import TaskInfo
57
+ from .task_output import TaskOutput
58
+ from .upload_task_output import upload_task_output
59
+
60
+
61
class FunctionExecutorController:
    """Controls a single Function Executor (FE) and the tasks assigned to it.

    The controller is event driven: long running work (FE creation and destruction,
    task preparation, task execution, task output upload) runs in aio tasks spawned
    via `_spawn_aio`. Each of these aio tasks reports its result back by adding an
    event which is then handled synchronously by the control loop, one event at a time.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_description: FunctionExecutorDescription,
        function_executor_server_factory: FunctionExecutorServerFactory,
        state_reporter: ExecutorStateReporter,
        blob_store: BLOBStore,
        base_url: str,
        config_path: str,
        cache_path: Path,
        logger: Any,
    ):
        """Initializes the FunctionExecutorController.

        The supplied FunctionExecutorDescription must be already validated by the caller
        using validate_function_executor_description().
        """
        self._executor_id: str = executor_id
        self._function_executor_description: FunctionExecutorDescription = (
            function_executor_description
        )
        self._function_executor_server_factory: FunctionExecutorServerFactory = (
            function_executor_server_factory
        )
        self._state_reporter: ExecutorStateReporter = state_reporter
        self._blob_store: BLOBStore = blob_store
        self._base_url: str = base_url
        self._config_path: str = config_path
        self._cache_path: Path = cache_path
        self._logger: Any = function_executor_logger(
            function_executor_description, logger.bind(module=__name__)
        )
        # Mutable state. No lock needed as it's modified by async tasks running in
        # the same event loop.
        self._function_executor: Optional[FunctionExecutor] = None
        # FE Status reported to Server.
        self._status: FunctionExecutorStatus = (
            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
        )
        metric_function_executors_with_status.labels(
            status=_to_fe_status_metric_label(self._status, self._logger)
        ).inc()
        # Ordered list of events to be processed by the control loop.
        self._events: List[BaseEvent] = []
        # Asyncio event used to notify the control loop that there are new events to process.
        self._event_added: asyncio.Event = asyncio.Event()
        # Control loop asyncio task.
        self._control_loop_aio_task: Optional[asyncio.Task] = None
        # aio tasks spawned by the control loop.
        self._running_aio_tasks: List[asyncio.Task] = []
        # Info for all known tasks, Task ID -> TaskInfo.
        self._tasks: Dict[str, TaskInfo] = {}
        # Tracking of task execution on Function Executor.
        self._runnable_tasks: List[TaskInfo] = []
        self._running_task: Optional[TaskInfo] = None

    def function_executor_id(self) -> str:
        """Returns the ID from the Function Executor description supplied at construction."""
        return self._function_executor_description.id

    def status(self) -> FunctionExecutorStatus:
        """Returns the current status of the Function Executor.

        Not blocking.
        """
        return self._status

    def add_task(self, task: Task, allocation_id: str) -> None:
        """Adds a task to the Function Executor.

        A task whose ID is already known is ignored with a warning. Otherwise the task
        is recorded and its preparation is started in a spawned aio task.
        Not blocking. Never raises exceptions.
        """
        logger = task_logger(task, self._logger)
        if self.has_task(task.id):
            logger.warning(
                "attempted to add already added task to Function Executor",
            )
            return

        metric_tasks_fetched.inc()
        task_info: TaskInfo = TaskInfo(
            task=task, allocation_id=allocation_id, start_time=time.monotonic()
        )
        self._tasks[task.id] = task_info
        next_aio = prepare_task(
            task_info=task_info,
            blob_store=self._blob_store,
            logger=logger,
        )
        self._spawn_aio_for_task(
            aio=next_aio,
            task_info=task_info,
            on_exception=TaskPreparationFinished(task_info=task_info, is_success=False),
        )

    def has_task(self, task_id: str) -> bool:
        """Checks if the Function Executor has a task with the given ID.

        Not blocking. Never raises exceptions.
        """
        return task_id in self._tasks

    def task_ids(self) -> List[str]:
        """Returns the list of task IDs known to the Function Executor.

        Not blocking. Never raises exceptions.
        """
        return list(self._tasks.keys())

    def remove_task(self, task_id: str) -> None:
        """Removes the task from the Function Executor.

        The task is forgotten by this controller immediately, i.e. has_task() returns
        False for it right after this call. If the task was already completed then
        nothing else is done. Otherwise the task is marked as cancelled and its
        currently running aio task (if any) is cancelled. The cancellation is
        asynchronous and might take a while to complete.
        Not blocking. Never raises exceptions.
        """
        if not self.has_task(task_id):
            self._logger.warning(
                "attempted to cancel a task that is not known to the Function Executor",
                task_id=task_id,
            )
            return

        task_info: TaskInfo = self._tasks.pop(task_id)
        if task_info.is_completed:
            return  # Server processed the completed task outputs, we can forget it now.

        # Task cancellation is required as the task is not completed yet.
        logger = task_logger(task_info.task, self._logger)
        task_info.is_cancelled = True
        logger.info(
            "cancelling task",
            allocation_id=task_info.allocation_id,
        )
        if task_info.aio_task is not None:
            task_info.aio_task.cancel()

    def startup(self) -> None:
        """Starts up the Function Executor and prepares it to run tasks.

        Starts the control loop, reports PENDING status and spawns the FE creation.
        Repeated calls are ignored with a warning.
        Not blocking. Never raises exceptions."""
        if self._control_loop_aio_task is not None:
            self._logger.warning(
                "ignoring startup call as the Function Executor is already started"
            )
            return

        self._control_loop_aio_task = asyncio.create_task(
            self._control_loop(),
            name="function executor control loop",
        )
        self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING)
        next_aio = create_function_executor(
            function_executor_description=self._function_executor_description,
            function_executor_server_factory=self._function_executor_server_factory,
            blob_store=self._blob_store,
            executor_id=self._executor_id,
            base_url=self._base_url,
            config_path=self._config_path,
            cache_path=self._cache_path,
            logger=self._logger,
        )
        self._spawn_aio_for_fe(
            aio=next_aio,
            on_exception=FunctionExecutorCreated(
                function_executor=None,
                termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
            ),
        )

    async def shutdown(
        self, termination_reason: FunctionExecutorTerminationReason
    ) -> None:
        """Shuts down the Function Executor and frees all of its resources.

        Adds a ShutdownInitiated event and waits for the control loop to exit.
        Doesn't raise any exceptions. Blocks until the shutdown is complete.
        """
        self._add_event(
            ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
        )
        try:
            await self._control_loop_aio_task
        except asyncio.CancelledError:
            pass  # Expected exception on shutdown
        except Exception as e:
            self._logger.error(
                "function executor controller control loop raised unexpected exception",
                exc_info=e,
            )
        self._logger.info("function executor controller shutdown finished")

    def _set_status(
        self,
        status: FunctionExecutorStatus,
        termination_reason: Optional[FunctionExecutorTerminationReason] = None,
    ) -> None:
        """Sets Function Executor status and reports it to the Server.

        Also moves the per-status FE gauge from the old status label to the new one.
        termination_reason is attached to the reported FE state only if it's not None.
        Not blocking. Never raises exceptions."""
        old_status: FunctionExecutorStatus = self._status
        new_status: FunctionExecutorStatus = status
        self._status = new_status

        self._logger.info(
            "function executor status changed",
            old_status=FunctionExecutorStatus.Name(old_status),
            new_status=FunctionExecutorStatus.Name(new_status),
            termination_reason=_termination_reason_to_short_name(termination_reason),
        )
        metric_function_executors_with_status.labels(
            status=_to_fe_status_metric_label(old_status, self._logger)
        ).dec()
        metric_function_executors_with_status.labels(
            status=_to_fe_status_metric_label(new_status, self._logger)
        ).inc()

        new_fe_state = FunctionExecutorState(
            description=self._function_executor_description, status=new_status
        )
        if termination_reason is not None:
            new_fe_state.termination_reason = termination_reason
        self._state_reporter.update_function_executor_state(new_fe_state)
        # Report the status change to the Server asap to reduce latency in the system.
        self._state_reporter.schedule_state_report()

    async def _control_loop(self) -> None:
        """Runs control loop that coordinates all the work done by the Function Executor.

        Waits until events are added, then handles all the pending events in FIFO order.
        Exits after handling a ShutdownInitiated event.
        Doesn't raise any Exceptions.
        """
        self._logger.info("function executor controller control loop started")

        while True:
            await self._event_added.wait()
            self._event_added.clear()

            while self._events:
                event: BaseEvent = self._events.pop(0)
                debug_print_processing_event(event, self._logger)

                try:
                    if event.event_type == EventType.SHUTDOWN_INITIATED:
                        return await self._shutdown_no_exceptions(event)

                    with metric_control_loop_handle_event_latency.time():
                        self._handle_event(event)
                except BaseException as e:
                    # None of the event handlers should raise exceptions, but still catch all exceptions to ensure
                    # that the control loop doesn't crash if an unexpected exception happen.
                    self._logger.error(
                        "unexpected exception in function executor controller control loop",
                        exc_info=e,
                        fe_event=str(event),
                    )

    def _handle_event(self, event: BaseEvent) -> None:
        """Dispatches the event to its handler based on the event type.

        Events of unexpected types are logged and otherwise ignored.
        Doesn't raise any exceptions. Doesn't block.
        """
        if event.event_type == EventType.FUNCTION_EXECUTOR_CREATED:
            return self._handle_event_function_executor_created(event)
        elif event.event_type == EventType.FUNCTION_EXECUTOR_DESTROYED:
            return self._handle_event_function_executor_destroyed(event)
        elif event.event_type == EventType.TASK_PREPARATION_FINISHED:
            return self._handle_event_task_preparation_finished(event)
        elif event.event_type == EventType.SCHEDULE_TASK_EXECUTION:
            return self._handle_event_schedule_task_execution(event)
        elif event.event_type == EventType.TASK_EXECUTION_FINISHED:
            return self._handle_event_task_execution_finished(event)
        elif event.event_type == EventType.TASK_OUTPUT_UPLOAD_FINISHED:
            return self._handle_event_task_output_upload_finished(event)

        self._logger.warning(
            "unexpected event type received", event_type=event.event_type.name
        )

    def _add_event(self, event: BaseEvent, source: str) -> None:
        """Adds an event to the list of events to be processed by the control loop.

        source is only used for debug printing.
        Doesn't raise any exceptions. Doesn't block."""
        debug_print_adding_event(event=event, source=source, logger=self._logger)
        self._events.append(event)
        self._event_added.set()

    def _spawn_aio_for_task(
        self,
        aio: Coroutine[Any, Any, BaseEvent],
        task_info: TaskInfo,
        on_exception: BaseEvent,
    ) -> None:
        """Spawns an aio task that does work for the given task. See `_spawn_aio`."""
        self._spawn_aio(
            aio=aio,
            task_info=task_info,
            on_exception=on_exception,
            logger=task_logger(task_info.task, self._logger),
        )

    def _spawn_aio_for_fe(
        self, aio: Coroutine[Any, Any, BaseEvent], on_exception: BaseEvent
    ) -> None:
        """Spawns an aio task that does work for the Function Executor itself. See `_spawn_aio`."""
        self._spawn_aio(
            aio=aio,
            task_info=None,
            on_exception=on_exception,
            logger=self._logger,
        )

    def _spawn_aio(
        self,
        aio: Coroutine[Any, Any, BaseEvent],
        task_info: Optional[TaskInfo],
        on_exception: BaseEvent,
        logger: Any,
    ) -> None:
        """Spawns an aio task for the supplied coroutine.

        The coroutine should return an event that will be added to the FE controller events.
        The coroutine should not raise any exceptions.
        on_exception event will be added to the FE controller events if the aio task raises an unexpected exception.
        on_exception is required to not silently stall the task processing due to an unexpected exception.
        If task_info is not None, the aio task will be associated with the task_info while the aio task is running.
        Doesn't raise any exceptions. Doesn't block.
        Use `_spawn_aio_for_task` and `_spawn_aio_for_fe` instead of directly calling this method.
        """

        aio_task_name: str = str(aio)
        # Wrap the coroutine into aio task to disable warning "coroutine was never awaited" when the task is cancelled.
        inner_aio_task: asyncio.Task = asyncio.create_task(aio, name=aio_task_name)

        async def coroutine_wrapper() -> None:
            try:
                self._add_event(await inner_aio_task, source=aio_task_name)
            except asyncio.CancelledError:
                pass  # Expected exception on aio task cancellation.
            except BaseException as e:
                logger.error(
                    "unexpected exception in aio task",
                    exc_info=e,
                    aio_task_name=aio_task_name,
                )
                self._add_event(on_exception, source=aio_task_name)
            finally:
                if task_info is not None:
                    task_info.aio_task = None
                self._running_aio_tasks.remove(asyncio.current_task())

        def cleanup_if_wrapper_never_ran(wrapper_task: asyncio.Task) -> None:
            # If the wrapper task gets cancelled before its first step, asyncio throws
            # CancelledError into the not yet started coroutine, so the `finally` clause
            # above never runs and the cancellation never reaches the inner aio task.
            # Do the same cleanup here in that case.
            if wrapper_task not in self._running_aio_tasks:
                return  # Normal path, the wrapper already cleaned up after itself.
            self._running_aio_tasks.remove(wrapper_task)
            # Don't clobber an aio task that was associated with the task_info later.
            if task_info is not None and task_info.aio_task is wrapper_task:
                task_info.aio_task = None
            inner_aio_task.cancel()

        aio_wrapper_task: asyncio.Task = asyncio.create_task(
            coroutine_wrapper(),
            name=f"function executor controller aio task '{aio_task_name}'",
        )
        aio_wrapper_task.add_done_callback(cleanup_if_wrapper_never_ran)
        self._running_aio_tasks.append(aio_wrapper_task)
        if task_info is not None:
            task_info.aio_task = aio_wrapper_task

    # Event handlers for the events added to the control loop.
    # All the event handlers are synchronous and never block on any long running operations.

    def _handle_event_function_executor_created(
        self, event: FunctionExecutorCreated
    ) -> None:
        """Handles the startup finished event.

        If the FE creation failed then starts destroying the FE with the termination
        reason from the event. Otherwise reports RUNNING status, starts the FE health
        checker and invokes the task scheduler.
        Doesn't raise any exceptions. Doesn't block.
        """
        if event.function_executor is None:
            self._destroy_function_executor_before_termination(event.termination_reason)
            if event.function_error is not None:
                # TODO: Save stdout and stderr of customer code that ran during FE creation into BLOBs
                # so customers can debug their function initialization errors.
                # https://github.com/tensorlakeai/indexify/issues/1426
                self._logger.error(
                    "failed to create function executor due to error in customer code",
                    exc_info=event.function_error,
                )
            return

        self._function_executor = event.function_executor
        self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING)
        # Health checker starts after FE creation and gets automatically stopped on FE destroy.
        self._function_executor.health_checker().start(
            self._health_check_failed_callback
        )
        self._add_event(
            ScheduleTaskExecution(),
            source="_handle_event_function_executor_created",
        )

    def _handle_event_function_executor_destroyed(
        self, event: FunctionExecutorDestroyed
    ) -> None:
        """Handles the Function Executor destroy finished event.

        Reports TERMINATED status even if the destroy failed (the failure is logged).
        Doesn't raise any exceptions. Doesn't block.
        """
        if not event.is_success:
            self._logger.error(
                "Function Executor destroy failed unexpectedly, this should never happen",
            )
        # Set the status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
        self._set_status(
            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
            termination_reason=event.termination_reason,
        )
        # Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
        self._add_event(
            ScheduleTaskExecution(),
            source="_handle_event_function_executor_destroyed",
        )

    async def _health_check_failed_callback(self, result: HealthCheckResult):
        """Starts destroying the Function Executor with UNHEALTHY termination reason.

        Passed to the FE health checker when the health checker is started.
        """
        self._logger.error(
            "Function Executor health check failed, terminating Function Executor",
            reason=result.reason,
        )
        self._destroy_function_executor_before_termination(
            termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
        )

    def _handle_event_task_preparation_finished(
        self, event: TaskPreparationFinished
    ) -> None:
        """Handles the task preparation finished event.

        Cancelled tasks and tasks that failed to prepare go straight to output upload
        with the corresponding output. Successfully prepared tasks become runnable.
        Doesn't raise any exceptions. Doesn't block.
        """
        task_info: TaskInfo = event.task_info

        if task_info.is_cancelled:
            task_info.output = TaskOutput.task_cancelled(
                task=task_info.task, allocation_id=task_info.allocation_id
            )
            self._start_task_output_upload(task_info)
            return
        if not event.is_success:
            task_info.output = TaskOutput.internal_error(
                task=task_info.task, allocation_id=task_info.allocation_id
            )
            self._start_task_output_upload(task_info)
            return

        task_info.prepared_time = time.monotonic()
        metric_runnable_tasks.inc()
        metric_runnable_tasks_per_function_name.labels(
            task_info.task.function_name
        ).inc()
        self._runnable_tasks.append(task_info)
        self._add_event(
            ScheduleTaskExecution(),
            source="_handle_event_task_preparation_finished",
        )

    def _handle_event_schedule_task_execution(
        self, event: ScheduleTaskExecution
    ) -> None:
        """Takes the next runnable task (if any) and progresses it.

        Does nothing if there are no runnable tasks, if the FE is neither RUNNING nor
        TERMINATED, or if the FE is RUNNING and already runs a task. Otherwise the next
        runnable task either gets its "cancelled" or "FE terminated" output uploaded
        or starts running on the FE.
        Doesn't raise any exceptions. Doesn't block.
        """
        if len(self._runnable_tasks) == 0:
            return

        if self._status not in [
            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
        ]:
            return  # Can't progress pending task with the current status.

        if (
            self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING
            and self._running_task is not None
        ):
            return

        # Take the next task from head to get FIFO order and improve fairness.
        task_info: TaskInfo = self._pop_runnable_task()
        # Re-invoke the scheduler later to process the next runnable task if this one can't run on FE.
        self._add_event(
            ScheduleTaskExecution(),
            source="_handle_event_schedule_task_execution",
        )

        if task_info.is_cancelled:
            task_info.output = TaskOutput.task_cancelled(
                task=task_info.task, allocation_id=task_info.allocation_id
            )
            self._start_task_output_upload(task_info)
        elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
            task_info.output = TaskOutput.function_executor_terminated(
                task=task_info.task, allocation_id=task_info.allocation_id
            )
            self._start_task_output_upload(task_info)
        elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
            self._running_task = task_info
            next_aio = run_task_on_function_executor(
                task_info=task_info,
                function_executor=self._function_executor,
                logger=task_logger(task_info.task, self._logger),
            )
            self._spawn_aio_for_task(
                aio=next_aio,
                task_info=task_info,
                on_exception=TaskExecutionFinished(
                    task_info=task_info,
                    function_executor_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
                ),
            )
        else:
            task_logger(task_info.task, self._logger).error(
                "failed to schedule task execution, this should never happen"
            )

    def _pop_runnable_task(self) -> TaskInfo:
        """Removes and returns the oldest runnable task, updating the scheduling metrics.

        The caller must ensure that there's at least one runnable task.
        """
        task_info: TaskInfo = self._runnable_tasks.pop(0)
        metric_schedule_task_latency.observe(time.monotonic() - task_info.prepared_time)
        metric_runnable_tasks.dec()
        metric_runnable_tasks_per_function_name.labels(
            task_info.task.function_name
        ).dec()
        return task_info

    def _handle_event_task_execution_finished(
        self, event: TaskExecutionFinished
    ) -> None:
        """Handles the task execution finished event.

        Either invokes the scheduler to run the next task or, if the event carries
        an FE termination reason, starts destroying the FE. Then starts the upload
        of the finished task output.
        Doesn't raise any exceptions. Doesn't block.
        """
        self._running_task = None

        if event.function_executor_termination_reason is None:
            self._add_event(
                ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
            )
        else:
            self._destroy_function_executor_before_termination(
                termination_reason=event.function_executor_termination_reason
            )

        # Ignore is_cancelled because cancelling a task still involves uploading its output.
        # We'll just upload a real output instead of "task cancelled" output.
        # Adds TaskOutputUploadFinished event when done.
        self._start_task_output_upload(event.task_info)

    def _start_task_output_upload(self, task_info: TaskInfo) -> None:
        """Starts the task output upload for the given task.

        Doesn't raise any exceptions. Doesn't block.
        """
        next_aio = upload_task_output(
            task_info=task_info,
            blob_store=self._blob_store,
            logger=task_logger(task_info.task, self._logger),
        )
        self._spawn_aio_for_task(
            aio=next_aio,
            task_info=task_info,
            on_exception=TaskOutputUploadFinished(
                task_info=task_info, is_success=False
            ),
        )

    def _handle_event_task_output_upload_finished(
        self, event: TaskOutputUploadFinished
    ) -> None:
        """Handles the task output upload finished event.

        Replaces the task output with an internal error output if the upload failed,
        then completes the task.
        Doesn't raise any exceptions. Doesn't block.
        """
        # Ignore task cancellation because we need to report it to the server anyway.
        task_info: TaskInfo = event.task_info
        if not event.is_success:
            task_info.output = TaskOutput.internal_error(
                task=task_info.task, allocation_id=task_info.allocation_id
            )

        self._complete_task(event.task_info)

    def _complete_task(self, task_info: TaskInfo) -> None:
        """Marks the task as completed and reports it to the Server.

        Doesn't raise any exceptions. Doesn't block.
        """
        task_info.is_completed = True
        emit_completed_task_metrics(
            task_info=task_info,
            logger=task_logger(task_info.task, self._logger),
        )
        # Reconciler will call .remove_task() once Server signals that it processed this update.
        self._state_reporter.add_completed_task_output(task_info.output)
        self._state_reporter.schedule_state_report()

    def _destroy_function_executor_before_termination(
        self, termination_reason: FunctionExecutorTerminationReason
    ) -> None:
        """Destroys the Function Executor and frees all its resources to prepare for transitioning to the TERMINATED state.

        The destroy runs in a spawned aio task which adds FunctionExecutorDestroyed event when done.
        Doesn't raise any exceptions. Doesn't block.
        """
        next_aio = destroy_function_executor(
            function_executor=self._function_executor,
            termination_reason=termination_reason,
            logger=self._logger,
        )
        self._function_executor = None
        self._spawn_aio_for_fe(
            aio=next_aio,
            on_exception=FunctionExecutorDestroyed(
                is_success=False, termination_reason=termination_reason
            ),
        )

    async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
        """Runs `_shutdown` and logs instead of propagating anything it raises."""
        try:
            await self._shutdown(event)
        except BaseException as e:
            # This would result in resource leaks.
            self._logger.error(
                "unexpected exception in function executor controller shutdown, this should never happen",
                exc_info=e,
            )

    async def _shutdown(self, event: ShutdownInitiated) -> None:
        """Shuts down the Function Executor and frees all its resources.

        The control loop must be blocked while this method is running.
        The control loop must exit immediately after this method returns.
        Doesn't raise any exceptions.

        Server needs to wait until all the tasks its interested in got their outcomes reported
        before calling the FE shutdown as we don't report anything on FE shutdown.
        """
        self._logger.info("function executor controller shutdown initiated")
        # Control loop is blocked executing this method, no new aio tasks will be spawned concurrently.
        # Create a copy of the running aio tasks because they remove themselves from the list when they finish.
        cancelled_tasks: List[asyncio.Task] = self._running_aio_tasks.copy()
        for cancelled_task in cancelled_tasks:
            cancelled_task.cancel()

        # Await all aio tasks to make sure that nothing is mutating this FE controller state concurrently.
        for cancelled_task in cancelled_tasks:
            try:
                await cancelled_task
            except BaseException:
                # Ignore any errors as we expect them when cancelling tasks.
                # BaseException includes asyncio.CancelledError which is always raised here.
                pass

        if self._status != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
            self._handle_event_function_executor_destroyed(
                await destroy_function_executor(
                    function_executor=self._function_executor,
                    termination_reason=event.termination_reason,
                    logger=self._logger,
                )
            )

        self._state_reporter.remove_function_executor_info(self.function_executor_id())
        self._state_reporter.schedule_state_report()

        self._logger.info("function executor controller control loop finished")
        debug_print_events(events=self._events, logger=self._logger)
721
+
722
+
723
def _to_fe_status_metric_label(status: FunctionExecutorStatus, logger: Any) -> str:
    """Returns the metric label value for the supplied Function Executor status.

    An unexpected status is logged as an error and mapped to the UNKNOWN label.
    Doesn't raise any exceptions.
    """
    if status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING:
        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
    else:
        # Name() raises for a value that is not defined in the enum, which is the
        # kind of value this branch exists to report, so fall back to the raw
        # value instead of failing while logging.
        try:
            status_name: str = FunctionExecutorStatus.Name(status)
        except (ValueError, TypeError):
            status_name = str(status)
        logger.error(
            "unexpected Function Executor status",
            status=status_name,
        )
        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
738
+
739
+
740
# Maps each known FunctionExecutorTerminationReason value to its name without the
# common "FUNCTION_EXECUTOR_TERMINATION_REASON_" prefix.
_termination_reason_to_short_name_map = {
    getattr(
        FunctionExecutorTerminationReason,
        f"FUNCTION_EXECUTOR_TERMINATION_REASON_{short_name}",
    ): short_name
    for short_name in (
        "UNKNOWN",
        "STARTUP_FAILED_INTERNAL_ERROR",
        "STARTUP_FAILED_FUNCTION_ERROR",
        "STARTUP_FAILED_FUNCTION_TIMEOUT",
        "EXECUTOR_SHUTDOWN",
        "REMOVED_FROM_DESIRED_STATE",
        "UNHEALTHY",
        "INTERNAL_ERROR",
        "FUNCTION_TIMEOUT",
        "FUNCTION_CANCELLED",
    )
}
752
+
753
+
754
def _termination_reason_to_short_name(
    value: Optional[FunctionExecutorTerminationReason],
) -> str:
    """Returns a short, log friendly name of the supplied termination reason.

    Returns "None" if the value is None and "UNEXPECTED" if the value is not
    present in `_termination_reason_to_short_name_map`.
    """
    # The enum value names are really long, shorten them to make the logs more readable.
    if value is None:
        return "None"

    return _termination_reason_to_short_name_map.get(value, "UNEXPECTED")