indexify 0.3.31__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,759 @@
|
|
1
|
+
import asyncio
|
2
|
+
import time
|
3
|
+
from collections.abc import Coroutine
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from indexify.executor.blob_store.blob_store import BLOBStore
|
8
|
+
from indexify.executor.function_executor.function_executor import FunctionExecutor
|
9
|
+
from indexify.executor.function_executor.health_checker import HealthCheckResult
|
10
|
+
from indexify.executor.function_executor.server.function_executor_server_factory import (
|
11
|
+
FunctionExecutorServerFactory,
|
12
|
+
)
|
13
|
+
from indexify.executor.state_reporter import ExecutorStateReporter
|
14
|
+
from indexify.proto.executor_api_pb2 import (
|
15
|
+
FunctionExecutorDescription,
|
16
|
+
FunctionExecutorState,
|
17
|
+
FunctionExecutorStatus,
|
18
|
+
FunctionExecutorTerminationReason,
|
19
|
+
Task,
|
20
|
+
)
|
21
|
+
|
22
|
+
from .completed_task_metrics import emit_completed_task_metrics
|
23
|
+
from .create_function_executor import create_function_executor
|
24
|
+
from .debug_event_loop import (
|
25
|
+
debug_print_adding_event,
|
26
|
+
debug_print_events,
|
27
|
+
debug_print_processing_event,
|
28
|
+
)
|
29
|
+
from .destroy_function_executor import destroy_function_executor
|
30
|
+
from .events import (
|
31
|
+
BaseEvent,
|
32
|
+
EventType,
|
33
|
+
FunctionExecutorCreated,
|
34
|
+
FunctionExecutorDestroyed,
|
35
|
+
ScheduleTaskExecution,
|
36
|
+
ShutdownInitiated,
|
37
|
+
TaskExecutionFinished,
|
38
|
+
TaskOutputUploadFinished,
|
39
|
+
TaskPreparationFinished,
|
40
|
+
)
|
41
|
+
from .loggers import function_executor_logger, task_logger
|
42
|
+
from .metrics.function_executor_controller import (
|
43
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
|
44
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
|
45
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
|
46
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
|
47
|
+
metric_control_loop_handle_event_latency,
|
48
|
+
metric_function_executors_with_status,
|
49
|
+
metric_runnable_tasks,
|
50
|
+
metric_runnable_tasks_per_function_name,
|
51
|
+
metric_schedule_task_latency,
|
52
|
+
metric_tasks_fetched,
|
53
|
+
)
|
54
|
+
from .prepare_task import prepare_task
|
55
|
+
from .run_task import run_task_on_function_executor
|
56
|
+
from .task_info import TaskInfo
|
57
|
+
from .task_output import TaskOutput
|
58
|
+
from .upload_task_output import upload_task_output
|
59
|
+
|
60
|
+
|
61
|
+
class FunctionExecutorController:
|
62
|
+
def __init__(
    self,
    executor_id: str,
    function_executor_description: FunctionExecutorDescription,
    function_executor_server_factory: FunctionExecutorServerFactory,
    state_reporter: ExecutorStateReporter,
    blob_store: BLOBStore,
    base_url: str,
    config_path: str,
    cache_path: Path,
    logger: Any,
):
    """Initializes the FunctionExecutorController.

    The supplied FunctionExecutorDescription must be already validated by the caller
    using validate_function_executor_description().
    """
    self._executor_id: str = executor_id
    self._function_executor_description: FunctionExecutorDescription = (
        function_executor_description
    )
    self._function_executor_server_factory: FunctionExecutorServerFactory = (
        function_executor_server_factory
    )
    self._state_reporter: ExecutorStateReporter = state_reporter
    self._blob_store: BLOBStore = blob_store
    self._base_url: str = base_url
    self._config_path: str = config_path
    self._cache_path: Path = cache_path
    # Bind FE identity into the logger so all controller logs are attributable.
    self._logger: Any = function_executor_logger(
        function_executor_description, logger.bind(module=__name__)
    )

    # Mutable state below. No lock is needed: everything is mutated by async
    # tasks running on the same event loop.
    self._function_executor: Optional[FunctionExecutor] = None
    # FE status as reported to the Server.
    self._status: FunctionExecutorStatus = (
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
    )
    metric_function_executors_with_status.labels(
        status=_to_fe_status_metric_label(self._status, self._logger)
    ).inc()
    # FIFO queue of events consumed by the control loop.
    self._events: List[BaseEvent] = []
    # Wakes the control loop when a new event is queued.
    self._event_added: asyncio.Event = asyncio.Event()
    # The control loop asyncio task (None until startup()).
    self._control_loop_aio_task: Optional[asyncio.Task] = None
    # All aio tasks spawned by the control loop, for cleanup on shutdown.
    self._running_aio_tasks: List[asyncio.Task] = []
    # All known tasks: Task ID -> TaskInfo.
    self._tasks: Dict[str, TaskInfo] = {}
    # Task execution bookkeeping on this Function Executor.
    self._runnable_tasks: List[TaskInfo] = []
    self._running_task: Optional[TaskInfo] = None
def function_executor_id(self) -> str:
    """Returns the ID from this controller's FunctionExecutorDescription."""
    return self._function_executor_description.id

def status(self) -> FunctionExecutorStatus:
    """Returns the current status of the Function Executor.

    Not blocking.
    """
    return self._status
def add_task(self, task: Task, allocation_id: str) -> None:
    """Adds a task to the Function Executor.

    Not blocking. Never raises exceptions.
    """
    logger = task_logger(task, self._logger)
    if self.has_task(task.id):
        # Duplicate adds are ignored; the first add wins.
        logger.warning(
            "attempted to add already added task to Function Executor",
        )
        return

    metric_tasks_fetched.inc()
    info = TaskInfo(
        task=task, allocation_id=allocation_id, start_time=time.monotonic()
    )
    self._tasks[task.id] = info
    # Preparation runs asynchronously; it emits TaskPreparationFinished when done.
    self._spawn_aio_for_task(
        aio=prepare_task(
            task_info=info,
            blob_store=self._blob_store,
            logger=logger,
        ),
        task_info=info,
        on_exception=TaskPreparationFinished(task_info=info, is_success=False),
    )
def has_task(self, task_id: str) -> bool:
    """Checks if the Function Executor has a task with the given ID.

    Not blocking. Never raises exceptions.
    """
    return task_id in self._tasks

def task_ids(self) -> List[str]:
    """Returns the list of task IDs known to the Function Executor.

    Not blocking. Never raises exceptions.
    """
    return list(self._tasks)
def remove_task(self, task_id: str) -> None:
    """Removes the task from the Function Executor.

    Cancels the task if it's still in progress; a completed task is simply
    forgotten. Cancellation is asynchronous and may take a while to finish.
    NOTE(review): the TaskInfo is popped from the known-task map immediately,
    while the in-flight aio task is only signalled to cancel — confirm callers
    don't rely on has_task() staying true until cancellation completes.
    Not blocking. Never raises exceptions.
    """
    if task_id not in self._tasks:
        self._logger.warning(
            "attempted to cancel a task that is not known to the Function Executor",
            task_id=task_id,
        )
        return

    info: TaskInfo = self._tasks.pop(task_id)
    if info.is_completed:
        # Server processed the completed task outputs; nothing left to do.
        return

    # The task is still in flight, so request cancellation.
    info.is_cancelled = True
    task_logger(info.task, self._logger).info(
        "cancelling task",
        allocation_id=info.allocation_id,
    )
    if info.aio_task is not None:
        info.aio_task.cancel()
def startup(self) -> None:
    """Starts up the Function Executor and prepares it to run tasks.

    Not blocking. Never raises exceptions."""
    if self._control_loop_aio_task is not None:
        # Idempotence guard: a second startup call is a no-op.
        self._logger.warning(
            "ignoring startup call as the Function Executor is already started"
        )
        return

    self._control_loop_aio_task = asyncio.create_task(
        self._control_loop(),
        name="function executor control loop",
    )
    self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING)
    # FE creation runs asynchronously; it emits FunctionExecutorCreated when done.
    creation_aio = create_function_executor(
        function_executor_description=self._function_executor_description,
        function_executor_server_factory=self._function_executor_server_factory,
        blob_store=self._blob_store,
        executor_id=self._executor_id,
        base_url=self._base_url,
        config_path=self._config_path,
        cache_path=self._cache_path,
        logger=self._logger,
    )
    self._spawn_aio_for_fe(
        aio=creation_aio,
        on_exception=FunctionExecutorCreated(
            function_executor=None,
            termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
        ),
    )
async def shutdown(
    self, termination_reason: FunctionExecutorTerminationReason
) -> None:
    """Shuts down the Function Executor and frees all of its resources.

    All the tasks are reported as failed with FE Terminated failure code.
    Doesn't raise any exceptions. Blocks until the shutdown is complete.
    """
    self._add_event(
        ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
    )
    # Bug fix: if startup() was never called, there is no control loop task and
    # `await None` would raise TypeError, violating the "doesn't raise" contract.
    if self._control_loop_aio_task is None:
        self._logger.info("function executor controller shutdown finished")
        return
    try:
        await self._control_loop_aio_task
    except asyncio.CancelledError:
        pass  # Expected exception on shutdown
    except Exception as e:
        self._logger.error(
            "function executor controller control loop raised unexpected exception",
            exc_info=e,
        )
    self._logger.info("function executor controller shutdown finished")
def _set_status(
    self,
    status: FunctionExecutorStatus,
    # Bug fix: was annotated as plain FunctionExecutorTerminationReason with a
    # None default (a side comment admitted it should be Optional).
    termination_reason: Optional[FunctionExecutorTerminationReason] = None,
) -> None:
    """Sets Function Executor status and reports it to the Server.

    Not blocking. Never raises exceptions."""
    old_status: FunctionExecutorStatus = self._status
    new_status: FunctionExecutorStatus = status
    self._status = new_status

    self._logger.info(
        "function executor status changed",
        old_status=FunctionExecutorStatus.Name(old_status),
        new_status=FunctionExecutorStatus.Name(new_status),
        termination_reason=_termination_reason_to_short_name(termination_reason),
    )
    # Move this FE between per-status metric buckets.
    metric_function_executors_with_status.labels(
        status=_to_fe_status_metric_label(old_status, self._logger)
    ).dec()
    metric_function_executors_with_status.labels(
        status=_to_fe_status_metric_label(new_status, self._logger)
    ).inc()

    new_fe_state = FunctionExecutorState(
        description=self._function_executor_description, status=new_status
    )
    if termination_reason is not None:
        new_fe_state.termination_reason = termination_reason
    self._state_reporter.update_function_executor_state(new_fe_state)
    # Report the status change to the Server asap to reduce latency in the system.
    self._state_reporter.schedule_state_report()
async def _control_loop(self) -> None:
    """Runs control loop that coordinates all the work done by the Function Executor.

    Doesn't raise any Exceptions.
    """
    self._logger.info("function executor controller control loop started")

    while True:
        # Sleep until at least one event is queued, then drain the queue.
        await self._event_added.wait()
        self._event_added.clear()

        while self._events:
            current: BaseEvent = self._events.pop(0)
            debug_print_processing_event(current, self._logger)

            try:
                # Shutdown terminates the loop; everything else is dispatched.
                if current.event_type == EventType.SHUTDOWN_INITIATED:
                    return await self._shutdown_no_exceptions(current)

                with metric_control_loop_handle_event_latency.time():
                    self._handle_event(current)
            except BaseException as e:
                # Handlers shouldn't raise, but catch everything so an
                # unexpected exception can't kill the control loop.
                self._logger.error(
                    "unexpected exception in function executor controller control loop",
                    exc_info=e,
                    fe_event=str(current),
                )
def _handle_event(self, event: BaseEvent) -> None:
    """Dispatches the event to its handler.

    Doesn't raise any exceptions. Doesn't block.
    """
    dispatch = {
        EventType.FUNCTION_EXECUTOR_CREATED: self._handle_event_function_executor_created,
        EventType.FUNCTION_EXECUTOR_DESTROYED: self._handle_event_function_executor_destroyed,
        EventType.TASK_PREPARATION_FINISHED: self._handle_event_task_preparation_finished,
        EventType.SCHEDULE_TASK_EXECUTION: self._handle_event_schedule_task_execution,
        EventType.TASK_EXECUTION_FINISHED: self._handle_event_task_execution_finished,
        EventType.TASK_OUTPUT_UPLOAD_FINISHED: self._handle_event_task_output_upload_finished,
    }
    handler = dispatch.get(event.event_type)
    if handler is not None:
        return handler(event)

    self._logger.warning(
        "unexpected event type received", event_type=event.event_type.name
    )
def _add_event(self, event: BaseEvent, source: str) -> None:
    """Queues an event for the control loop and wakes the loop up.

    Doesn't raise any exceptions. Doesn't block."""
    debug_print_adding_event(event=event, source=source, logger=self._logger)
    self._events.append(event)
    self._event_added.set()
def _spawn_aio_for_task(
    self,
    aio: Coroutine[Any, Any, BaseEvent],
    task_info: TaskInfo,
    on_exception: BaseEvent,
) -> None:
    """Spawns an aio task bound to the supplied task (uses a task-scoped logger)."""
    self._spawn_aio(
        aio=aio,
        task_info=task_info,
        on_exception=on_exception,
        logger=task_logger(task_info.task, self._logger),
    )

def _spawn_aio_for_fe(
    self, aio: Coroutine[Any, Any, BaseEvent], on_exception: BaseEvent
) -> None:
    """Spawns an aio task for FE-level work not tied to any particular task."""
    self._spawn_aio(
        aio=aio,
        task_info=None,
        on_exception=on_exception,
        logger=self._logger,
    )
def _spawn_aio(
    self,
    aio: Coroutine[Any, Any, BaseEvent],
    task_info: Optional[TaskInfo],
    on_exception: BaseEvent,
    logger: Any,
) -> None:
    """Spawns an aio task for the supplied coroutine.

    The coroutine should return an event that will be added to the FE controller
    events, and should not raise exceptions itself. If the aio task still raises
    an unexpected exception, `on_exception` is queued instead so task processing
    can't silently stall. While the aio task runs, it is attached to `task_info`
    (when given) so remove_task() can cancel it.
    Doesn't raise any exceptions. Doesn't block.
    Use `_spawn_aio_for_task` and `_spawn_aio_for_fe` instead of calling this directly.
    """
    aio_task_name: str = str(aio)
    # Wrap the coroutine in a task right away to avoid "coroutine was never
    # awaited" warnings if it gets cancelled before its first await.
    inner_task: asyncio.Task = asyncio.create_task(aio, name=aio_task_name)

    async def wrapper() -> None:
        try:
            self._add_event(await inner_task, source=aio_task_name)
        except asyncio.CancelledError:
            pass  # Expected exception on aio task cancellation.
        except BaseException as e:
            logger.error(
                "unexpected exception in aio task",
                exc_info=e,
                aio_task_name=aio_task_name,
            )
            self._add_event(on_exception, source=aio_task_name)
        finally:
            # Detach from the TaskInfo and the running-tasks list regardless
            # of how the aio task finished.
            if task_info is not None:
                task_info.aio_task = None
            self._running_aio_tasks.remove(asyncio.current_task())

    wrapper_task: asyncio.Task = asyncio.create_task(
        wrapper(),
        name=f"function executor controller aio task '{aio_task_name}'",
    )
    self._running_aio_tasks.append(wrapper_task)
    if task_info is not None:
        task_info.aio_task = wrapper_task
+
# Event handlers for the events added to the control loop.
|
419
|
+
# All the event handlers are synchronous and never block on any long running operations.
|
420
|
+
|
421
|
+
def _handle_event_function_executor_created(
    self, event: FunctionExecutorCreated
) -> None:
    """Handles the FE creation finished event.

    Doesn't raise any exceptions. Doesn't block.
    """
    if event.function_executor is None:
        # Creation failed: tear the FE down before reporting TERMINATED.
        self._destroy_function_executor_before_termination(event.termination_reason)
        if event.function_error is not None:
            # TODO: Save stdout and stderr of customer code that ran during FE creation into BLOBs
            # so customers can debug their function initialization errors.
            # https://github.com/tensorlakeai/indexify/issues/1426
            self._logger.error(
                "failed to create function executor due to error in customer code",
                exc_info=event.function_error,
            )
        return

    self._function_executor = event.function_executor
    self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING)
    # Health checker starts after FE creation and gets automatically stopped on FE destroy.
    self._function_executor.health_checker().start(
        self._health_check_failed_callback
    )
    # Kick the scheduler so any already-prepared tasks start running.
    self._add_event(
        ScheduleTaskExecution(),
        source="_handle_event_function_executor_created",
    )
def _handle_event_function_executor_destroyed(
    self, event: FunctionExecutorDestroyed
) -> None:
    """Handles the FE destroy finished event.

    Doesn't raise any exceptions. Doesn't block.
    """
    if not event.is_success:
        self._logger.error(
            "Function Executor destroy failed unexpectedly, this should never happen",
        )
    # Status changes only after the FE is destroyed: the Server assumes all FE
    # resources are freed once the status flips.
    self._set_status(
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
        termination_reason=event.termination_reason,
    )
    # Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
    self._add_event(
        ScheduleTaskExecution(),
        source="_handle_event_function_executor_destroyed",
    )
async def _health_check_failed_callback(self, result: HealthCheckResult):
    """Invoked by the health checker; tears the FE down as unhealthy."""
    self._logger.error(
        "Function Executor health check failed, terminating Function Executor",
        reason=result.reason,
    )
    self._destroy_function_executor_before_termination(
        termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
    )
def _handle_event_task_preparation_finished(
    self, event: TaskPreparationFinished
) -> None:
    """Handles the task preparation finished event.

    Doesn't raise any exceptions. Doesn't block.
    """
    info: TaskInfo = event.task_info

    # Cancelled or failed preparation short-circuits straight to output upload.
    if info.is_cancelled:
        info.output = TaskOutput.task_cancelled(
            task=info.task, allocation_id=info.allocation_id
        )
        self._start_task_output_upload(info)
        return
    if not event.is_success:
        info.output = TaskOutput.internal_error(
            task=info.task, allocation_id=info.allocation_id
        )
        self._start_task_output_upload(info)
        return

    # Preparation succeeded: mark runnable and ask the scheduler to run it.
    info.prepared_time = time.monotonic()
    metric_runnable_tasks.inc()
    metric_runnable_tasks_per_function_name.labels(info.task.function_name).inc()
    self._runnable_tasks.append(info)
    self._add_event(
        ScheduleTaskExecution(),
        source="_handle_event_task_preparation_finished",
    )
def _handle_event_schedule_task_execution(
    self, event: ScheduleTaskExecution
) -> None:
    """Progresses at most one runnable task, if the FE status allows it.

    Doesn't raise any exceptions. Doesn't block.
    """
    if not self._runnable_tasks:
        return

    if self._status not in (
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
    ):
        return  # Can't progress pending task with the current status.

    if (
        self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING
        and self._running_task is not None
    ):
        return  # Only one task runs on the FE at a time.

    # Take the next task from head to get FIFO order and improve fairness.
    info: TaskInfo = self._pop_runnable_task()
    # Re-invoke the scheduler later to process the next runnable task if this one can't run on FE.
    self._add_event(
        ScheduleTaskExecution(),
        source="_handle_event_schedule_task_execution",
    )

    if info.is_cancelled:
        info.output = TaskOutput.task_cancelled(
            task=info.task, allocation_id=info.allocation_id
        )
        self._start_task_output_upload(info)
    elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
        # FE is gone; fail the task with FE Terminated.
        info.output = TaskOutput.function_executor_terminated(
            task=info.task, allocation_id=info.allocation_id
        )
        self._start_task_output_upload(info)
    elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
        self._running_task = info
        run_aio = run_task_on_function_executor(
            task_info=info,
            function_executor=self._function_executor,
            logger=task_logger(info.task, self._logger),
        )
        self._spawn_aio_for_task(
            aio=run_aio,
            task_info=info,
            on_exception=TaskExecutionFinished(
                task_info=info,
                function_executor_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
            ),
        )
    else:
        task_logger(info.task, self._logger).error(
            "failed to schedule task execution, this should never happen"
        )
def _pop_runnable_task(self) -> TaskInfo:
    """Pops the head runnable task (FIFO) and updates scheduling metrics."""
    info: TaskInfo = self._runnable_tasks.pop(0)
    metric_schedule_task_latency.observe(time.monotonic() - info.prepared_time)
    metric_runnable_tasks.dec()
    metric_runnable_tasks_per_function_name.labels(info.task.function_name).dec()
    return info
def _handle_event_task_execution_finished(
    self, event: TaskExecutionFinished
) -> None:
    """Handles the task execution finished event.

    Doesn't raise any exceptions. Doesn't block.
    """
    self._running_task = None

    if event.function_executor_termination_reason is None:
        # FE is still healthy; let the scheduler pick the next task.
        self._add_event(
            ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
        )
    else:
        self._destroy_function_executor_before_termination(
            termination_reason=event.function_executor_termination_reason
        )

    # Ignore is_cancelled because cancelling a task still involves uploading its output.
    # We'll just upload a real output instead of "task cancelled" output.
    # Adds TaskOutputUploadFinished event when done.
    self._start_task_output_upload(event.task_info)
def _start_task_output_upload(self, task_info: TaskInfo) -> None:
    """Starts the asynchronous output upload for the given task.

    Doesn't raise any exceptions. Doesn't block.
    """
    upload_aio = upload_task_output(
        task_info=task_info,
        blob_store=self._blob_store,
        logger=task_logger(task_info.task, self._logger),
    )
    self._spawn_aio_for_task(
        aio=upload_aio,
        task_info=task_info,
        on_exception=TaskOutputUploadFinished(
            task_info=task_info, is_success=False
        ),
    )
def _handle_event_task_output_upload_finished(
    self, event: TaskOutputUploadFinished
) -> None:
    """Handles the task output upload finished event.

    Doesn't raise any exceptions. Doesn't block.
    """
    # Ignore task cancellation because we need to report it to the server anyway.
    info: TaskInfo = event.task_info
    if not event.is_success:
        # Replace the output so the Server sees an internal error.
        info.output = TaskOutput.internal_error(
            task=info.task, allocation_id=info.allocation_id
        )

    self._complete_task(info)
def _complete_task(self, task_info: TaskInfo) -> None:
    """Marks the task as completed and reports it to the Server.

    Doesn't raise any exceptions. Doesn't block.
    """
    task_info.is_completed = True
    per_task_logger = task_logger(task_info.task, self._logger)
    emit_completed_task_metrics(task_info=task_info, logger=per_task_logger)
    # Reconciler will call .remove_task() once Server signals that it processed this update.
    reporter = self._state_reporter
    reporter.add_completed_task_output(task_info.output)
    reporter.schedule_state_report()
|
650
|
+
|
651
|
+
def _destroy_function_executor_before_termination(
    self, termination_reason: FunctionExecutorTerminationReason
) -> None:
    """Destroys the Function Executor and frees all its resources to prepare for transitioning to the TERMINATED state.

    Doesn't raise any exceptions. Doesn't block.
    """
    destroy_aio = destroy_function_executor(
        function_executor=self._function_executor,
        termination_reason=termination_reason,
        logger=self._logger,
    )
    # Drop our reference right away; the spawned aio owns the teardown from here.
    self._function_executor = None
    fallback_event = FunctionExecutorDestroyed(
        is_success=False, termination_reason=termination_reason
    )
    self._spawn_aio_for_fe(aio=destroy_aio, on_exception=fallback_event)
|
670
|
+
|
671
|
+
async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
    """Runs _shutdown() and logs (instead of propagating) any exception it leaks."""
    try:
        await self._shutdown(event)
    except BaseException as e:
        # This would result in resource leaks.
        self._logger.error(
            "unexpected exception in function executor controller shutdown, this should never happen",
            exc_info=e,
        )
|
680
|
+
|
681
|
+
async def _shutdown(self, event: ShutdownInitiated) -> None:
    """Shuts down the Function Executor and frees all its resources.

    The control loop must be blocked while this method is running.
    The control loop must exit immediately after this method returns.
    Doesn't raise any exceptions.

    Server needs to wait until all the tasks it's interested in got their outcomes reported
    before calling the FE shutdown as we don't report anything on FE shutdown.
    """
    self._logger.info("function executor controller shutdown initiated")
    # Control loop is blocked executing this method, no new aio tasks will be spawned concurrently.
    # Create a copy of the running aio tasks because they remove themselves from the list when they finish.
    cancelled_tasks: List[asyncio.Task] = self._running_aio_tasks.copy()
    # First pass: request cancellation of every task before awaiting any of them,
    # so they all wind down concurrently rather than one at a time.
    for cancelled_task in cancelled_tasks:
        cancelled_task.cancel()

    # Await all aio tasks to make sure that nothing is mutating this FE controller state concurrently.
    for cancelled_task in cancelled_tasks:
        try:
            await cancelled_task
        except BaseException:
            # Ignore any errors as we expect them when cancelling tasks.
            # BaseException includes asyncio.CancelledError which is always raised here.
            pass

    # If the FE hasn't reached TERMINATED status yet, destroy it synchronously
    # and run the destroyed-event handler inline (the control loop is blocked).
    if self._status != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
        self._handle_event_function_executor_destroyed(
            await destroy_function_executor(
                function_executor=self._function_executor,
                termination_reason=event.termination_reason,
                logger=self._logger,
            )
        )

    # Stop reporting this FE to the Server and schedule one final state report.
    self._state_reporter.remove_function_executor_info(self.function_executor_id())
    self._state_reporter.schedule_state_report()

    self._logger.info("function executor controller control loop finished")
    debug_print_events(events=self._events, logger=self._logger)
|
721
|
+
|
722
|
+
|
723
|
+
def _to_fe_status_metric_label(status: FunctionExecutorStatus, logger: Any) -> str:
    """Returns the metric label for the given FE status, logging unexpected values."""
    known_labels = {
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN: METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING: METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING: METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED: METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
    }
    label = known_labels.get(status)
    if label is not None:
        return label
    # Unknown status value: log it and fall back to the UNKNOWN label.
    logger.error(
        "unexpected Function Executor status",
        status=FunctionExecutorStatus.Name(status),
    )
    return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
|
738
|
+
|
739
|
+
|
740
|
+
# Maps each FunctionExecutorTerminationReason enum value to a short,
# log-friendly name; the generated proto enum value names are very long.
_termination_reason_to_short_name_map = {
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNKNOWN: "UNKNOWN",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED: "FUNCTION_CANCELLED",
}
|
752
|
+
|
753
|
+
|
754
|
+
def _termination_reason_to_short_name(value: FunctionExecutorTerminationReason) -> str:
    """Returns a short, log-friendly name for the given termination reason.

    Returns "None" for None and "UNEXPECTED" for values missing from the map.
    """
    # The enum value names are really long, shorten them to make the logs more readable.
    # NOTE(review): callers may pass None despite the annotation — consider
    # annotating as Optional[FunctionExecutorTerminationReason].
    if value is None:
        return "None"
    return _termination_reason_to_short_name_map.get(value, "UNEXPECTED")
|