indexify 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/executor.py +11 -7
- indexify/executor/function_executor_controller/__init__.py +2 -2
- indexify/executor/function_executor_controller/completed_task_allocation_metrics.py +87 -0
- indexify/executor/function_executor_controller/events.py +29 -33
- indexify/executor/function_executor_controller/{finalize_task.py → finalize_task_allocation.py} +45 -37
- indexify/executor/function_executor_controller/function_executor_controller.py +194 -180
- indexify/executor/function_executor_controller/loggers.py +15 -17
- indexify/executor/function_executor_controller/message_validators.py +4 -12
- indexify/executor/function_executor_controller/metrics/completed_task_allocation_metrics.py +70 -0
- indexify/executor/function_executor_controller/metrics/finalize_task_allocation.py +26 -0
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +12 -11
- indexify/executor/function_executor_controller/metrics/prepare_task_allocation.py +27 -0
- indexify/executor/function_executor_controller/{prepare_task.py → prepare_task_allocation.py} +33 -29
- indexify/executor/function_executor_controller/{run_task.py → run_task_allocation.py} +54 -51
- indexify/executor/function_executor_controller/{task_info.py → task_allocation_info.py} +6 -6
- indexify/executor/function_executor_controller/{task_input.py → task_allocation_input.py} +2 -2
- indexify/executor/function_executor_controller/{task_output.py → task_allocation_output.py} +24 -24
- indexify/executor/monitoring/desired_state_handler.py +24 -0
- indexify/executor/monitoring/reported_state_handler.py +22 -0
- indexify/executor/monitoring/server.py +4 -0
- indexify/executor/state_reconciler.py +26 -19
- indexify/executor/state_reporter.py +9 -4
- {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/METADATA +2 -2
- {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/RECORD +27 -25
- indexify/executor/function_executor_controller/completed_task_metrics.py +0 -83
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -68
- indexify/executor/function_executor_controller/metrics/finalize_task.py +0 -20
- indexify/executor/function_executor_controller/metrics/prepare_task.py +0 -18
- /indexify/executor/function_executor_controller/metrics/{run_task.py → run_task_allocation.py} +0 -0
- {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/WHEEL +0 -0
- {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/entry_points.txt +0 -0
@@ -29,7 +29,7 @@ from indexify.proto.executor_api_pb2 import (
|
|
29
29
|
TaskResult,
|
30
30
|
)
|
31
31
|
|
32
|
-
from .
|
32
|
+
from .completed_task_allocation_metrics import emit_completed_task_allocation_metrics
|
33
33
|
from .create_function_executor import create_function_executor
|
34
34
|
from .debug_event_loop import (
|
35
35
|
debug_print_adding_event,
|
@@ -41,13 +41,13 @@ from .events import (
|
|
41
41
|
EventType,
|
42
42
|
FunctionExecutorCreated,
|
43
43
|
FunctionExecutorTerminated,
|
44
|
-
|
44
|
+
ScheduleTaskAllocationExecution,
|
45
45
|
ShutdownInitiated,
|
46
|
-
|
47
|
-
|
48
|
-
|
46
|
+
TaskAllocationExecutionFinished,
|
47
|
+
TaskAllocationFinalizationFinished,
|
48
|
+
TaskAllocationPreparationFinished,
|
49
49
|
)
|
50
|
-
from .
|
50
|
+
from .finalize_task_allocation import finalize_task_allocation
|
51
51
|
from .loggers import function_executor_logger, task_allocation_logger
|
52
52
|
from .metrics.function_executor_controller import (
|
53
53
|
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
|
@@ -58,16 +58,16 @@ from .metrics.function_executor_controller import (
|
|
58
58
|
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
|
59
59
|
metric_control_loop_handle_event_latency,
|
60
60
|
metric_function_executors_with_state,
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
61
|
+
metric_runnable_task_allocations,
|
62
|
+
metric_runnable_task_allocations_per_function_name,
|
63
|
+
metric_schedule_task_allocation_latency,
|
64
|
+
metric_task_allocations_fetched,
|
65
65
|
)
|
66
|
-
from .
|
67
|
-
from .
|
68
|
-
from .
|
69
|
-
from .
|
70
|
-
from .
|
66
|
+
from .prepare_task_allocation import prepare_task_allocation
|
67
|
+
from .run_task_allocation import run_task_allocation_on_function_executor
|
68
|
+
from .task_allocation_info import TaskAllocationInfo
|
69
|
+
from .task_allocation_input import TaskAllocationInput
|
70
|
+
from .task_allocation_output import TaskAllocationOutput
|
71
71
|
from .terminate_function_executor import terminate_function_executor
|
72
72
|
|
73
73
|
|
@@ -133,84 +133,87 @@ class FunctionExecutorController:
|
|
133
133
|
self._control_loop_aio_task: Optional[asyncio.Task] = None
|
134
134
|
# aio tasks spawned by the control loop.
|
135
135
|
self._running_aio_tasks: List[asyncio.Task] = []
|
136
|
-
#
|
137
|
-
self.
|
138
|
-
#
|
139
|
-
self.
|
140
|
-
|
136
|
+
# All task allocations assigned to FE, Allocation ID -> TaskAllocationInfo.
|
137
|
+
self._task_allocations: Dict[str, TaskAllocationInfo] = {}
|
138
|
+
# Task allocations prepared for execution on FE.
|
139
|
+
self._runnable_task_allocations: List[TaskAllocationInfo] = []
|
140
|
+
# Task allocations currently running on the FE.
|
141
|
+
self._running_task_allocations: List[TaskAllocationInfo] = []
|
141
142
|
|
142
143
|
def function_executor_id(self) -> str:
|
143
144
|
return self._fe_description.id
|
144
145
|
|
145
146
|
def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
|
146
|
-
"""Adds a task to the Function Executor.
|
147
|
+
"""Adds a task allocation to the Function Executor.
|
147
148
|
|
148
149
|
Not blocking. Never raises exceptions.
|
149
150
|
"""
|
150
151
|
logger = task_allocation_logger(task_allocation, self._logger)
|
151
|
-
if self.
|
152
|
+
if self.has_task_allocation(task_allocation.allocation_id):
|
152
153
|
logger.warning(
|
153
|
-
"attempted to add already added task to Function Executor",
|
154
|
+
"attempted to add already added task allocation to Function Executor",
|
154
155
|
)
|
155
156
|
return
|
156
157
|
|
157
|
-
|
158
|
-
|
158
|
+
metric_task_allocations_fetched.inc()
|
159
|
+
alloc_info: TaskAllocationInfo = TaskAllocationInfo(
|
159
160
|
allocation=task_allocation, start_time=time.monotonic()
|
160
161
|
)
|
161
|
-
self.
|
162
|
-
next_aio =
|
163
|
-
|
162
|
+
self._task_allocations[task_allocation.allocation_id] = alloc_info
|
163
|
+
next_aio = prepare_task_allocation(
|
164
|
+
alloc_info=alloc_info,
|
164
165
|
blob_store=self._blob_store,
|
165
166
|
logger=logger,
|
166
167
|
)
|
167
|
-
self.
|
168
|
+
self._spawn_aio_for_task_alloc(
|
168
169
|
aio=next_aio,
|
169
|
-
|
170
|
-
on_exception=
|
170
|
+
alloc_info=alloc_info,
|
171
|
+
on_exception=TaskAllocationPreparationFinished(
|
172
|
+
alloc_info=alloc_info, is_success=False
|
173
|
+
),
|
171
174
|
)
|
172
175
|
|
173
|
-
def
|
174
|
-
"""Checks if the Function Executor has a task with the given ID.
|
176
|
+
def has_task_allocation(self, task_allocation_id: str) -> bool:
|
177
|
+
"""Checks if the Function Executor has a task allocation with the given ID.
|
175
178
|
|
176
179
|
Not blocking. Never raises exceptions.
|
177
180
|
"""
|
178
|
-
return
|
181
|
+
return task_allocation_id in self._task_allocations
|
179
182
|
|
180
|
-
def
|
181
|
-
"""Returns the list of task IDs known to the Function Executor.
|
183
|
+
def task_allocation_ids(self) -> List[str]:
|
184
|
+
"""Returns the list of task allocation IDs known to the Function Executor.
|
182
185
|
|
183
186
|
Not blocking. Never raises exceptions.
|
184
187
|
"""
|
185
|
-
return list(self.
|
188
|
+
return list(self._task_allocations.keys())
|
186
189
|
|
187
|
-
def
|
188
|
-
"""Removes the task from the Function Executor.
|
190
|
+
def remove_task_allocation(self, task_allocation_id: str) -> None:
|
191
|
+
"""Removes the task allocation from the Function Executor.
|
189
192
|
|
190
|
-
Cancels the task if it's in progress. Just removes the task if it was already
|
191
|
-
The cancellation is asynchronous and might take a while to complete.
|
193
|
+
Cancels the task allocation if it's in progress. Just removes the task allocation if it was already
|
194
|
+
completed. The cancellation is asynchronous and might take a while to complete.
|
192
195
|
Until the cancellation is complete, the task won't be removed from the Function Executor.
|
193
196
|
Not blocking. Never raises exceptions.
|
194
197
|
"""
|
195
|
-
if not self.
|
198
|
+
if not self.has_task_allocation(task_allocation_id):
|
196
199
|
self._logger.warning(
|
197
|
-
"attempted to cancel a task that is not known to the Function Executor",
|
198
|
-
task_id=
|
200
|
+
"attempted to cancel a task allocation that is not known to the Function Executor",
|
201
|
+
task_id=task_allocation_id,
|
199
202
|
)
|
200
203
|
return
|
201
204
|
|
202
|
-
|
203
|
-
if
|
205
|
+
alloc_info: TaskAllocationInfo = self._task_allocations.pop(task_allocation_id)
|
206
|
+
if alloc_info.is_completed:
|
204
207
|
return # Server processed the completed task outputs, we can forget it now.
|
205
208
|
|
206
209
|
# Task cancellation is required as the task is not completed yet.
|
207
|
-
logger = task_allocation_logger(
|
208
|
-
|
210
|
+
logger = task_allocation_logger(alloc_info.allocation, self._logger)
|
211
|
+
alloc_info.is_cancelled = True
|
209
212
|
logger.info(
|
210
|
-
"cancelling task",
|
213
|
+
"cancelling task allocation",
|
211
214
|
)
|
212
|
-
if
|
213
|
-
|
215
|
+
if alloc_info.aio_task is not None:
|
216
|
+
alloc_info.aio_task.cancel()
|
214
217
|
|
215
218
|
def startup(self) -> None:
|
216
219
|
"""Starts up the Function Executor and prepares it to run tasks.
|
@@ -353,14 +356,14 @@ class FunctionExecutorController:
|
|
353
356
|
return self._handle_event_function_executor_created(event)
|
354
357
|
elif event.event_type == EventType.FUNCTION_EXECUTOR_TERMINATED:
|
355
358
|
return self._handle_event_function_executor_terminated(event)
|
356
|
-
elif event.event_type == EventType.
|
357
|
-
return self.
|
358
|
-
elif event.event_type == EventType.
|
359
|
-
return self.
|
360
|
-
elif event.event_type == EventType.
|
361
|
-
return self.
|
362
|
-
elif event.event_type == EventType.
|
363
|
-
return self.
|
359
|
+
elif event.event_type == EventType.TASK_ALLOCATION_PREPARATION_FINISHED:
|
360
|
+
return self._handle_event_task_allocation_preparation_finished(event)
|
361
|
+
elif event.event_type == EventType.SCHEDULE_TASK_ALLOCATION_EXECUTION:
|
362
|
+
return self._handle_event_schedule_task_allocation_execution(event)
|
363
|
+
elif event.event_type == EventType.TASK_ALLOCATION_EXECUTION_FINISHED:
|
364
|
+
return self._handle_event_task_allocation_execution_finished(event)
|
365
|
+
elif event.event_type == EventType.TASK_ALLOCATION_FINALIZATION_FINISHED:
|
366
|
+
return self._handle_event_task_allocation_finalization_finished(event)
|
364
367
|
|
365
368
|
self._logger.warning(
|
366
369
|
"unexpected event type received", event_type=event.event_type.name
|
@@ -374,17 +377,17 @@ class FunctionExecutorController:
|
|
374
377
|
self._events.append(event)
|
375
378
|
self._event_added.set()
|
376
379
|
|
377
|
-
def
|
380
|
+
def _spawn_aio_for_task_alloc(
|
378
381
|
self,
|
379
382
|
aio: Coroutine[Any, Any, BaseEvent],
|
380
|
-
|
383
|
+
alloc_info: TaskAllocationInfo,
|
381
384
|
on_exception: BaseEvent,
|
382
385
|
) -> None:
|
383
386
|
self._spawn_aio(
|
384
387
|
aio=aio,
|
385
|
-
|
388
|
+
alloc_info=alloc_info,
|
386
389
|
on_exception=on_exception,
|
387
|
-
logger=task_allocation_logger(
|
390
|
+
logger=task_allocation_logger(alloc_info.allocation, self._logger),
|
388
391
|
)
|
389
392
|
|
390
393
|
def _spawn_aio_for_fe(
|
@@ -392,7 +395,7 @@ class FunctionExecutorController:
|
|
392
395
|
) -> None:
|
393
396
|
self._spawn_aio(
|
394
397
|
aio=aio,
|
395
|
-
|
398
|
+
alloc_info=None,
|
396
399
|
on_exception=on_exception,
|
397
400
|
logger=self._logger,
|
398
401
|
)
|
@@ -400,7 +403,7 @@ class FunctionExecutorController:
|
|
400
403
|
def _spawn_aio(
|
401
404
|
self,
|
402
405
|
aio: Coroutine[Any, Any, BaseEvent],
|
403
|
-
|
406
|
+
alloc_info: Optional[TaskAllocationInfo],
|
404
407
|
on_exception: BaseEvent,
|
405
408
|
logger: Any,
|
406
409
|
) -> None:
|
@@ -410,9 +413,9 @@ class FunctionExecutorController:
|
|
410
413
|
The coroutine should not raise any exceptions including BaseException.
|
411
414
|
on_exception event will be added to the FE controller events if the aio task raises an unexpected exception.
|
412
415
|
on_exception is required to not silently stall the task processing due to an unexpected exception.
|
413
|
-
If
|
416
|
+
If alloc_info is not None, the aio task will be associated with the alloc_info while the aio task is running.
|
414
417
|
Doesn't raise any exceptions. Doesn't block.
|
415
|
-
Use `
|
418
|
+
Use `_spawn_aio_for_task_alloc` and `_spawn_aio_for_fe` instead of directly calling this method.
|
416
419
|
"""
|
417
420
|
|
418
421
|
aio_task_name: str = str(aio)
|
@@ -438,8 +441,8 @@ class FunctionExecutorController:
|
|
438
441
|
)
|
439
442
|
self._add_event(on_exception, source=aio_task_name)
|
440
443
|
finally:
|
441
|
-
if
|
442
|
-
|
444
|
+
if alloc_info is not None:
|
445
|
+
alloc_info.aio_task = None
|
443
446
|
self._running_aio_tasks.remove(asyncio.current_task())
|
444
447
|
|
445
448
|
aio_wrapper_task: asyncio.Task = asyncio.create_task(
|
@@ -447,8 +450,8 @@ class FunctionExecutorController:
|
|
447
450
|
name=f"function executor controller aio task '{aio_task_name}'",
|
448
451
|
)
|
449
452
|
self._running_aio_tasks.append(aio_wrapper_task)
|
450
|
-
if
|
451
|
-
|
453
|
+
if alloc_info is not None:
|
454
|
+
alloc_info.aio_task = aio_wrapper_task
|
452
455
|
|
453
456
|
# Event handlers for the events added to the control loop.
|
454
457
|
# All the event handlers are synchronous and never block on any long running operations.
|
@@ -466,18 +469,22 @@ class FunctionExecutorController:
|
|
466
469
|
# The allocations we marked here also must not use the FE terminated failure reason in their outputs
|
467
470
|
# because FE terminated means that the allocation wasn't the cause of the FE termination.
|
468
471
|
allocation_ids_caused_termination: List[str] = []
|
469
|
-
for
|
470
|
-
|
471
|
-
|
472
|
-
|
472
|
+
for alloc_info in self._task_allocations.values():
|
473
|
+
task_alloc_logger = task_allocation_logger(
|
474
|
+
alloc_info.allocation, self._logger
|
475
|
+
)
|
476
|
+
task_alloc_logger.info(
|
477
|
+
"marking task allocation failed on function executor startup failure"
|
473
478
|
)
|
474
479
|
allocation_ids_caused_termination.append(
|
475
|
-
|
480
|
+
alloc_info.allocation.allocation_id
|
476
481
|
)
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
482
|
+
alloc_info.output = (
|
483
|
+
TaskAllocationOutput.function_executor_startup_failed(
|
484
|
+
allocation=alloc_info.allocation,
|
485
|
+
fe_termination_reason=event.fe_termination_reason,
|
486
|
+
logger=task_alloc_logger,
|
487
|
+
)
|
481
488
|
)
|
482
489
|
self._start_termination(
|
483
490
|
fe_termination_reason=event.fe_termination_reason,
|
@@ -496,7 +503,7 @@ class FunctionExecutorController:
|
|
496
503
|
# Health checker starts after FE creation and gets automatically stopped on FE destroy.
|
497
504
|
self._fe.health_checker().start(self._health_check_failed_callback)
|
498
505
|
self._add_event(
|
499
|
-
|
506
|
+
ScheduleTaskAllocationExecution(),
|
500
507
|
source="_handle_event_function_executor_created",
|
501
508
|
)
|
502
509
|
|
@@ -526,7 +533,7 @@ class FunctionExecutorController:
|
|
526
533
|
|
527
534
|
# Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
|
528
535
|
self._add_event(
|
529
|
-
|
536
|
+
ScheduleTaskAllocationExecution(),
|
530
537
|
source="_handle_event_function_executor_destroyed",
|
531
538
|
)
|
532
539
|
|
@@ -539,55 +546,56 @@ class FunctionExecutorController:
|
|
539
546
|
self._start_termination(
|
540
547
|
fe_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY,
|
541
548
|
allocation_ids_caused_termination=[
|
542
|
-
|
549
|
+
alloc_info.allocation.allocation_id
|
550
|
+
for alloc_info in self._running_task_allocations
|
543
551
|
],
|
544
552
|
)
|
545
553
|
|
546
|
-
def
|
547
|
-
self, event:
|
554
|
+
def _handle_event_task_allocation_preparation_finished(
|
555
|
+
self, event: TaskAllocationPreparationFinished
|
548
556
|
) -> None:
|
549
|
-
"""Handles the task preparation finished event.
|
557
|
+
"""Handles the task allocation preparation finished event.
|
550
558
|
|
551
559
|
Doesn't raise any exceptions. Doesn't block.
|
552
560
|
"""
|
553
|
-
|
561
|
+
alloc_info: TaskAllocationInfo = event.alloc_info
|
554
562
|
|
555
|
-
if
|
556
|
-
|
557
|
-
allocation=
|
558
|
-
# Task was
|
563
|
+
if alloc_info.is_cancelled:
|
564
|
+
alloc_info.output = TaskAllocationOutput.task_allocation_cancelled(
|
565
|
+
allocation=alloc_info.allocation,
|
566
|
+
# Task alloc was never executed
|
559
567
|
execution_start_time=None,
|
560
568
|
execution_end_time=None,
|
561
569
|
)
|
562
|
-
self.
|
570
|
+
self._start_task_allocation_finalization(alloc_info)
|
563
571
|
return
|
564
572
|
|
565
573
|
if not event.is_success:
|
566
|
-
# Failed to prepare the task inputs.
|
567
|
-
|
568
|
-
allocation=
|
569
|
-
# Task was
|
574
|
+
# Failed to prepare the task alloc inputs.
|
575
|
+
alloc_info.output = TaskAllocationOutput.internal_error(
|
576
|
+
allocation=alloc_info.allocation,
|
577
|
+
# Task alloc was never executed
|
570
578
|
execution_start_time=None,
|
571
579
|
execution_end_time=None,
|
572
580
|
)
|
573
|
-
self.
|
581
|
+
self._start_task_allocation_finalization(alloc_info)
|
574
582
|
return
|
575
583
|
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
584
|
+
alloc_info.prepared_time = time.monotonic()
|
585
|
+
metric_runnable_task_allocations.inc()
|
586
|
+
metric_runnable_task_allocations_per_function_name.labels(
|
587
|
+
alloc_info.allocation.task.function_name
|
580
588
|
).inc()
|
581
|
-
self.
|
589
|
+
self._runnable_task_allocations.append(alloc_info)
|
582
590
|
self._add_event(
|
583
|
-
|
584
|
-
source="
|
591
|
+
ScheduleTaskAllocationExecution(),
|
592
|
+
source="_handle_event_task_allocation_preparation_finished",
|
585
593
|
)
|
586
594
|
|
587
|
-
def
|
588
|
-
self, event:
|
595
|
+
def _handle_event_schedule_task_allocation_execution(
|
596
|
+
self, event: ScheduleTaskAllocationExecution
|
589
597
|
) -> None:
|
590
|
-
if len(self.
|
598
|
+
if len(self._runnable_task_allocations) == 0:
|
591
599
|
return
|
592
600
|
|
593
601
|
if self._internal_state not in [
|
@@ -599,144 +607,150 @@ class FunctionExecutorController:
|
|
599
607
|
|
600
608
|
if (
|
601
609
|
self._internal_state == _FE_CONTROLLER_STATE.RUNNING
|
602
|
-
and len(self.
|
610
|
+
and len(self._running_task_allocations)
|
611
|
+
== self._fe_description.max_concurrency
|
603
612
|
):
|
604
613
|
return
|
605
614
|
|
606
|
-
# Take the next task from head to get FIFO order and improve fairness.
|
607
|
-
|
615
|
+
# Take the next task alloc from head to get FIFO order and improve fairness.
|
616
|
+
alloc_info: TaskAllocationInfo = self._pop_runnable_task_allocation()
|
608
617
|
# Re-invoke the scheduler later to process the next runnable task if this one can't run on FE.
|
609
618
|
self._add_event(
|
610
|
-
|
611
|
-
source="
|
619
|
+
ScheduleTaskAllocationExecution(),
|
620
|
+
source="_handle_event_schedule_task_allocation_execution",
|
612
621
|
)
|
613
622
|
|
614
|
-
if
|
615
|
-
|
616
|
-
allocation=
|
617
|
-
# Task
|
623
|
+
if alloc_info.is_cancelled:
|
624
|
+
alloc_info.output = TaskAllocationOutput.task_allocation_cancelled(
|
625
|
+
allocation=alloc_info.allocation,
|
626
|
+
# Task alloc was never executed
|
618
627
|
execution_start_time=None,
|
619
628
|
execution_end_time=None,
|
620
629
|
)
|
621
|
-
self.
|
630
|
+
self._start_task_allocation_finalization(alloc_info)
|
622
631
|
elif self._internal_state in [
|
623
632
|
_FE_CONTROLLER_STATE.TERMINATING,
|
624
633
|
_FE_CONTROLLER_STATE.TERMINATED,
|
625
634
|
]:
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
635
|
+
# The output could be set already by FE startup failure handler.
|
636
|
+
if alloc_info.output is None:
|
637
|
+
alloc_info.output = TaskAllocationOutput.function_executor_terminated(
|
638
|
+
alloc_info.allocation
|
630
639
|
)
|
631
|
-
self.
|
640
|
+
self._start_task_allocation_finalization(alloc_info)
|
632
641
|
elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
|
633
|
-
self.
|
634
|
-
next_aio =
|
635
|
-
|
642
|
+
self._running_task_allocations.append(alloc_info)
|
643
|
+
next_aio = run_task_allocation_on_function_executor(
|
644
|
+
alloc_info=alloc_info,
|
636
645
|
function_executor=self._fe,
|
637
|
-
logger=task_allocation_logger(
|
646
|
+
logger=task_allocation_logger(alloc_info.allocation, self._logger),
|
638
647
|
)
|
639
|
-
self.
|
648
|
+
self._spawn_aio_for_task_alloc(
|
640
649
|
aio=next_aio,
|
641
|
-
|
642
|
-
on_exception=
|
643
|
-
|
650
|
+
alloc_info=alloc_info,
|
651
|
+
on_exception=TaskAllocationExecutionFinished(
|
652
|
+
alloc_info=alloc_info,
|
644
653
|
function_executor_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
|
645
654
|
),
|
646
655
|
)
|
647
656
|
else:
|
648
|
-
task_allocation_logger(
|
649
|
-
"failed to schedule task execution, this should never happen"
|
657
|
+
task_allocation_logger(alloc_info.allocation, self._logger).error(
|
658
|
+
"failed to schedule task allocation execution, this should never happen"
|
650
659
|
)
|
651
660
|
|
652
|
-
def
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
661
|
+
def _pop_runnable_task_allocation(self) -> TaskAllocationInfo:
|
662
|
+
alloc_info: TaskAllocationInfo = self._runnable_task_allocations.pop(0)
|
663
|
+
metric_schedule_task_allocation_latency.observe(
|
664
|
+
time.monotonic() - alloc_info.prepared_time
|
665
|
+
)
|
666
|
+
metric_runnable_task_allocations.dec()
|
667
|
+
metric_runnable_task_allocations_per_function_name.labels(
|
668
|
+
alloc_info.allocation.task.function_name
|
658
669
|
).dec()
|
659
|
-
return
|
670
|
+
return alloc_info
|
660
671
|
|
661
|
-
def
|
662
|
-
self, event:
|
672
|
+
def _handle_event_task_allocation_execution_finished(
|
673
|
+
self, event: TaskAllocationExecutionFinished
|
663
674
|
) -> None:
|
664
|
-
"""Handles the task execution finished event.
|
675
|
+
"""Handles the task allocation execution finished event.
|
665
676
|
|
666
677
|
Doesn't raise any exceptions. Doesn't block.
|
667
678
|
"""
|
668
|
-
|
669
|
-
self.
|
679
|
+
alloc_info: TaskAllocationInfo = event.alloc_info
|
680
|
+
self._running_task_allocations.remove(alloc_info)
|
670
681
|
|
671
682
|
if event.function_executor_termination_reason is None:
|
672
683
|
self._add_event(
|
673
|
-
|
684
|
+
ScheduleTaskAllocationExecution(),
|
685
|
+
source="_handle_event_task_allocation_execution_finished",
|
674
686
|
)
|
675
687
|
else:
|
676
688
|
self._start_termination(
|
677
689
|
fe_termination_reason=event.function_executor_termination_reason,
|
678
|
-
allocation_ids_caused_termination=[
|
679
|
-
event.task_info.allocation.allocation_id
|
680
|
-
],
|
690
|
+
allocation_ids_caused_termination=[alloc_info.allocation.allocation_id],
|
681
691
|
)
|
682
692
|
|
683
|
-
if
|
693
|
+
if alloc_info.output is None:
|
684
694
|
# `run_task_allocation_on_function_executor` guarantees that the output is set in
|
685
695
|
# all cases including task cancellations. If this didn't happen then some
|
686
696
|
# internal error occurred in our code.
|
687
|
-
|
688
|
-
allocation=
|
697
|
+
alloc_info.output = TaskAllocationOutput.internal_error(
|
698
|
+
allocation=alloc_info.allocation,
|
689
699
|
execution_start_time=None,
|
690
700
|
execution_end_time=None,
|
691
701
|
)
|
692
702
|
|
693
|
-
self.
|
703
|
+
self._start_task_allocation_finalization(alloc_info)
|
694
704
|
|
695
|
-
def
|
696
|
-
|
705
|
+
def _start_task_allocation_finalization(
|
706
|
+
self, alloc_info: TaskAllocationInfo
|
707
|
+
) -> None:
|
708
|
+
"""Starts finalization for the given task allocation.
|
697
709
|
|
698
710
|
Doesn't raise any exceptions. Doesn't block.
|
699
|
-
|
711
|
+
alloc_info.output should not be None.
|
700
712
|
"""
|
701
|
-
next_aio =
|
702
|
-
|
713
|
+
next_aio = finalize_task_allocation(
|
714
|
+
task_alloc=alloc_info,
|
703
715
|
blob_store=self._blob_store,
|
704
|
-
logger=task_allocation_logger(
|
716
|
+
logger=task_allocation_logger(alloc_info.allocation, self._logger),
|
705
717
|
)
|
706
|
-
self.
|
718
|
+
self._spawn_aio_for_task_alloc(
|
707
719
|
aio=next_aio,
|
708
|
-
|
709
|
-
on_exception=
|
710
|
-
|
720
|
+
alloc_info=alloc_info,
|
721
|
+
on_exception=TaskAllocationFinalizationFinished(
|
722
|
+
alloc_info=alloc_info, is_success=False
|
711
723
|
),
|
712
724
|
)
|
713
725
|
|
714
|
-
def
|
715
|
-
self, event:
|
726
|
+
def _handle_event_task_allocation_finalization_finished(
|
727
|
+
self, event: TaskAllocationFinalizationFinished
|
716
728
|
) -> None:
|
717
|
-
"""Handles the task finalization finished event.
|
729
|
+
"""Handles the task allocation finalization finished event.
|
718
730
|
|
719
731
|
Doesn't raise any exceptions. Doesn't block.
|
720
732
|
"""
|
721
|
-
|
733
|
+
alloc_info: TaskAllocationInfo = event.alloc_info
|
722
734
|
if not event.is_success:
|
723
|
-
original_task_output:
|
724
|
-
|
725
|
-
|
735
|
+
original_task_output: TaskAllocationOutput = (
|
736
|
+
alloc_info.output
|
737
|
+
) # Never None here
|
738
|
+
alloc_info.output = TaskAllocationOutput.internal_error(
|
739
|
+
allocation=alloc_info.allocation,
|
726
740
|
execution_start_time=original_task_output.execution_start_time,
|
727
741
|
execution_end_time=original_task_output.execution_end_time,
|
728
742
|
)
|
729
743
|
|
730
|
-
logger: Any = task_allocation_logger(
|
744
|
+
logger: Any = task_allocation_logger(alloc_info.allocation, self._logger)
|
731
745
|
# Ignore task cancellation as it's technically finished at this point.
|
732
|
-
|
733
|
-
|
734
|
-
|
746
|
+
alloc_info.is_completed = True
|
747
|
+
emit_completed_task_allocation_metrics(
|
748
|
+
alloc_info=alloc_info,
|
735
749
|
logger=logger,
|
736
750
|
)
|
737
|
-
# Reconciler will call .
|
751
|
+
# Reconciler will call .remove_task_allocation() once Server signals that it processed this update.
|
738
752
|
self._state_reporter.add_completed_task_result(
|
739
|
-
_to_task_result_proto(
|
753
|
+
_to_task_result_proto(alloc_info, logger)
|
740
754
|
)
|
741
755
|
self._state_reporter.schedule_state_report()
|
742
756
|
|
@@ -781,7 +795,7 @@ class FunctionExecutorController:
|
|
781
795
|
The control loop must exit immediately after this method returns.
|
782
796
|
Doesn't raise any exceptions.
|
783
797
|
|
784
|
-
Server needs to wait until all the
|
798
|
+
Server needs to wait until all the task allocations it is interested in have their outcomes reported
|
785
799
|
before calling the FE shutdown as we don't report anything on FE shutdown.
|
786
800
|
"""
|
787
801
|
self._logger.info("function executor controller shutdown initiated")
|
@@ -860,12 +874,12 @@ def _termination_reason_to_short_name(value: FunctionExecutorTerminationReason)
|
|
860
874
|
return _termination_reason_to_short_name_map.get(value, "UNEXPECTED")
|
861
875
|
|
862
876
|
|
863
|
-
def _to_task_result_proto(
|
864
|
-
allocation: TaskAllocation =
|
877
|
+
def _to_task_result_proto(alloc_info: TaskAllocationInfo, logger: Any) -> TaskResult:
|
878
|
+
allocation: TaskAllocation = alloc_info.allocation
|
865
879
|
# Might be None if the task wasn't prepared successfully.
|
866
|
-
input: Optional[
|
880
|
+
input: Optional[TaskAllocationInput] = alloc_info.input
|
867
881
|
# Never None here as we're completing the task here.
|
868
|
-
output: Optional[
|
882
|
+
output: Optional[TaskAllocationOutput] = alloc_info.output
|
869
883
|
|
870
884
|
execution_duration_ms: Optional[int] = None
|
871
885
|
if (
|