indexify 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/executor.py +1 -1
- indexify/executor/function_executor/function_executor.py +35 -7
- indexify/executor/function_executor/invocation_state_client.py +26 -16
- indexify/executor/function_executor_controller/completed_task_metrics.py +1 -0
- indexify/executor/function_executor_controller/create_function_executor.py +1 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +14 -11
- indexify/executor/function_executor_controller/downloads.py +20 -9
- indexify/executor/function_executor_controller/events.py +3 -17
- indexify/executor/function_executor_controller/function_executor_controller.py +145 -119
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +25 -18
- indexify/executor/function_executor_controller/run_task.py +82 -20
- indexify/executor/function_executor_controller/task_output.py +11 -14
- indexify/executor/function_executor_controller/upload_task_output.py +102 -62
- indexify/executor/state_reconciler.py +2 -7
- indexify/proto/executor_api.proto +31 -30
- indexify/proto/executor_api_pb2.py +19 -21
- indexify/proto/executor_api_pb2.pyi +8 -22
- {indexify-0.4.9.dist-info → indexify-0.4.11.dist-info}/METADATA +2 -2
- {indexify-0.4.9.dist-info → indexify-0.4.11.dist-info}/RECORD +21 -21
- {indexify-0.4.9.dist-info → indexify-0.4.11.dist-info}/WHEEL +0 -0
- {indexify-0.4.9.dist-info → indexify-0.4.11.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
import asyncio
|
2
2
|
import time
|
3
3
|
from collections.abc import Coroutine
|
4
|
+
from enum import Enum
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Dict, List, Optional
|
6
7
|
|
@@ -43,12 +44,14 @@ from .events import (
|
|
43
44
|
from .function_executor_startup_output import FunctionExecutorStartupOutput
|
44
45
|
from .loggers import function_executor_logger, task_allocation_logger
|
45
46
|
from .metrics.function_executor_controller import (
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
|
48
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING,
|
49
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP,
|
50
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED,
|
51
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING,
|
52
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
|
50
53
|
metric_control_loop_handle_event_latency,
|
51
|
-
|
54
|
+
metric_function_executors_with_state,
|
52
55
|
metric_runnable_tasks,
|
53
56
|
metric_runnable_tasks_per_function_name,
|
54
57
|
metric_schedule_task_latency,
|
@@ -61,6 +64,16 @@ from .task_output import TaskOutput
|
|
61
64
|
from .upload_task_output import upload_task_output
|
62
65
|
|
63
66
|
|
67
|
+
# Actual FE controller states; they are a bit different from the statuses reported to the Server.
|
68
|
+
# All the valid state transitions are forward only (can skip multiple states in a row).
|
69
|
+
class _FE_CONTROLLER_STATE(Enum):
|
70
|
+
NOT_STARTED = 1
|
71
|
+
STARTING_UP = 2
|
72
|
+
RUNNING = 3
|
73
|
+
TERMINATING = 4
|
74
|
+
TERMINATED = 5
|
75
|
+
|
76
|
+
|
64
77
|
class FunctionExecutorController:
|
65
78
|
def __init__(
|
66
79
|
self,
|
@@ -94,19 +107,18 @@ class FunctionExecutorController:
|
|
94
107
|
self._logger: Any = function_executor_logger(
|
95
108
|
function_executor_description, logger.bind(module=__name__)
|
96
109
|
)
|
97
|
-
|
98
|
-
# the same event loop.
|
110
|
+
self._destroy_lock: asyncio.Lock = asyncio.Lock()
|
111
|
+
# Mutable state. No lock needed as it's modified by async tasks running in the same event loop.
|
99
112
|
self._fe: Optional[FunctionExecutor] = None
|
100
|
-
self._fe_termination_reason: FunctionExecutorTerminationReason =
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
self._status: FunctionExecutorStatus = (
|
105
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
|
106
|
-
)
|
107
|
-
metric_function_executors_with_status.labels(
|
108
|
-
status=_to_fe_status_metric_label(self._status, self._logger)
|
113
|
+
self._fe_termination_reason: Optional[FunctionExecutorTerminationReason] = None
|
114
|
+
self._internal_state = _FE_CONTROLLER_STATE.NOT_STARTED
|
115
|
+
metric_function_executors_with_state.labels(
|
116
|
+
state=_to_fe_state_metric_label(self._internal_state, self._logger)
|
109
117
|
).inc()
|
118
|
+
self._reported_state: FunctionExecutorState = FunctionExecutorState(
|
119
|
+
description=function_executor_description,
|
120
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN,
|
121
|
+
)
|
110
122
|
# Ordered list of events to be processed by the control loop.
|
111
123
|
self._events: List[BaseEvent] = []
|
112
124
|
# Asyncio event used to notify the control loop that there are new events to process.
|
@@ -124,13 +136,6 @@ class FunctionExecutorController:
|
|
124
136
|
def function_executor_id(self) -> str:
|
125
137
|
return self._fe_description.id
|
126
138
|
|
127
|
-
def status(self) -> FunctionExecutorStatus:
|
128
|
-
"""Returns the current status of the Function Executor.
|
129
|
-
|
130
|
-
Not blocking.
|
131
|
-
"""
|
132
|
-
return self._status
|
133
|
-
|
134
139
|
def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
|
135
140
|
"""Adds a task to the Function Executor.
|
136
141
|
|
@@ -197,7 +202,6 @@ class FunctionExecutorController:
|
|
197
202
|
task_info.is_cancelled = True
|
198
203
|
logger.info(
|
199
204
|
"cancelling task",
|
200
|
-
allocation_id=task_info.allocation.allocation_id,
|
201
205
|
)
|
202
206
|
if task_info.aio_task is not None:
|
203
207
|
task_info.aio_task.cancel()
|
@@ -206,9 +210,10 @@ class FunctionExecutorController:
|
|
206
210
|
"""Starts up the Function Executor and prepares it to run tasks.
|
207
211
|
|
208
212
|
Not blocking. Never raises exceptions."""
|
209
|
-
if self.
|
213
|
+
if self._internal_state != _FE_CONTROLLER_STATE.NOT_STARTED:
|
210
214
|
self._logger.warning(
|
211
|
-
"
|
215
|
+
"function executor state is not NOT_STARTED, ignoring startup call",
|
216
|
+
internal_state=self._internal_state.name,
|
212
217
|
)
|
213
218
|
return
|
214
219
|
|
@@ -216,7 +221,13 @@ class FunctionExecutorController:
|
|
216
221
|
self._control_loop(),
|
217
222
|
name="function executor control loop",
|
218
223
|
)
|
219
|
-
self.
|
224
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.STARTING_UP)
|
225
|
+
self._update_reported_state(
|
226
|
+
FunctionExecutorState(
|
227
|
+
description=self._fe_description,
|
228
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING,
|
229
|
+
)
|
230
|
+
)
|
220
231
|
next_aio = create_function_executor(
|
221
232
|
function_executor_description=self._fe_description,
|
222
233
|
function_executor_server_factory=self._fe_server_factory,
|
@@ -238,17 +249,13 @@ class FunctionExecutorController:
|
|
238
249
|
),
|
239
250
|
)
|
240
251
|
|
241
|
-
async def shutdown(
|
242
|
-
self, termination_reason: FunctionExecutorTerminationReason
|
243
|
-
) -> None:
|
252
|
+
async def shutdown(self) -> None:
|
244
253
|
"""Shuts down the Function Executor and frees all of its resources.
|
245
254
|
|
246
|
-
|
255
|
+
No task outcomes or outputs are reported to the Server after this call.
|
247
256
|
Doesn't raise any exceptions. Blocks until the shutdown is complete.
|
248
257
|
"""
|
249
|
-
self._add_event(
|
250
|
-
ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
|
251
|
-
)
|
258
|
+
self._add_event(ShutdownInitiated(), source="shutdown")
|
252
259
|
try:
|
253
260
|
await self._control_loop_aio_task
|
254
261
|
except asyncio.CancelledError:
|
@@ -260,51 +267,49 @@ class FunctionExecutorController:
|
|
260
267
|
)
|
261
268
|
self._logger.info("function executor controller shutdown finished")
|
262
269
|
|
263
|
-
def
|
270
|
+
def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
|
271
|
+
"""Updates the internal state of the Function Executor Controller.
|
272
|
+
|
273
|
+
Not blocking. Never raises exceptions."""
|
274
|
+
old_state: _FE_CONTROLLER_STATE = self._internal_state
|
275
|
+
self._internal_state = new_state
|
276
|
+
|
277
|
+
self._logger.info(
|
278
|
+
"function executor internal state changed",
|
279
|
+
old_state=old_state.name,
|
280
|
+
new_state=new_state.name,
|
281
|
+
)
|
282
|
+
|
283
|
+
metric_function_executors_with_state.labels(
|
284
|
+
state=_to_fe_state_metric_label(old_state, self._logger)
|
285
|
+
).dec()
|
286
|
+
metric_function_executors_with_state.labels(
|
287
|
+
state=_to_fe_state_metric_label(new_state, self._logger)
|
288
|
+
).inc()
|
289
|
+
|
290
|
+
def _update_reported_state(
|
264
291
|
self,
|
265
|
-
|
292
|
+
new_state: FunctionExecutorState,
|
266
293
|
) -> None:
|
267
|
-
"""Sets Function Executor
|
294
|
+
"""Sets new Function Executor state and reports it to the Server.
|
268
295
|
|
269
296
|
Not blocking. Never raises exceptions."""
|
270
|
-
|
271
|
-
|
272
|
-
self._status: FunctionExecutorStatus = new_status
|
297
|
+
old_state: FunctionExecutorState = self._reported_state
|
298
|
+
self._reported_state = new_state
|
273
299
|
|
274
300
|
self._logger.info(
|
275
|
-
"function executor status changed",
|
276
|
-
old_status=FunctionExecutorStatus.Name(
|
277
|
-
new_status=FunctionExecutorStatus.Name(
|
301
|
+
"function executor grpc status changed",
|
302
|
+
old_status=FunctionExecutorStatus.Name(old_state.status),
|
303
|
+
new_status=FunctionExecutorStatus.Name(new_state.status),
|
278
304
|
termination_reason=_termination_reason_to_short_name(
|
279
|
-
|
305
|
+
new_state.termination_reason
|
280
306
|
),
|
281
307
|
)
|
282
|
-
metric_function_executors_with_status.labels(
|
283
|
-
status=_to_fe_status_metric_label(old_status, self._logger)
|
284
|
-
).dec()
|
285
|
-
metric_function_executors_with_status.labels(
|
286
|
-
status=_to_fe_status_metric_label(new_status, self._logger)
|
287
|
-
).inc()
|
288
308
|
|
289
|
-
self._state_reporter.update_function_executor_state(
|
309
|
+
self._state_reporter.update_function_executor_state(new_state)
|
290
310
|
# Report the status change to the Server asap to reduce latency in the system.
|
291
311
|
self._state_reporter.schedule_state_report()
|
292
312
|
|
293
|
-
def _current_state(self) -> FunctionExecutorState:
|
294
|
-
"""Returns the current state of the Function Executor.
|
295
|
-
|
296
|
-
Not blocking. Never raises exceptions.
|
297
|
-
"""
|
298
|
-
termination_reason: Optional[FunctionExecutorTerminationReason] = None
|
299
|
-
if self._fe_termination_reason is not None:
|
300
|
-
termination_reason = self._fe_termination_reason
|
301
|
-
|
302
|
-
return FunctionExecutorState(
|
303
|
-
description=self._fe_description,
|
304
|
-
status=self._status,
|
305
|
-
termination_reason=termination_reason,
|
306
|
-
)
|
307
|
-
|
308
313
|
async def _control_loop(self) -> None:
|
309
314
|
"""Runs control loop that coordinates all the work done by the Function Executor.
|
310
315
|
|
@@ -332,7 +337,7 @@ class FunctionExecutorController:
|
|
332
337
|
self._logger.error(
|
333
338
|
"unexpected exception in function executor controller control loop",
|
334
339
|
exc_info=e,
|
335
|
-
|
340
|
+
event_type=event.event_type.name,
|
336
341
|
)
|
337
342
|
|
338
343
|
def _handle_event(self, event: BaseEvent) -> None:
|
@@ -455,13 +460,17 @@ class FunctionExecutorController:
|
|
455
460
|
self._state_reporter.schedule_state_report()
|
456
461
|
|
457
462
|
if event.function_executor is None:
|
458
|
-
self.
|
459
|
-
event.output.termination_reason
|
460
|
-
)
|
463
|
+
self._start_termination(termination_reason=event.output.termination_reason)
|
461
464
|
return
|
462
465
|
|
463
466
|
self._fe = event.function_executor
|
464
|
-
self.
|
467
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.RUNNING)
|
468
|
+
self._update_reported_state(
|
469
|
+
FunctionExecutorState(
|
470
|
+
description=self._fe_description,
|
471
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
|
472
|
+
)
|
473
|
+
)
|
465
474
|
# Health checker starts after FE creation and gets automatically stopped on FE destroy.
|
466
475
|
self._fe.health_checker().start(self._health_check_failed_callback)
|
467
476
|
self._add_event(
|
@@ -480,9 +489,18 @@ class FunctionExecutorController:
|
|
480
489
|
self._logger.error(
|
481
490
|
"Function Executor destroy failed unexpectedly, this should never happen",
|
482
491
|
)
|
483
|
-
|
484
|
-
self.
|
485
|
-
|
492
|
+
|
493
|
+
self._fe = None
|
494
|
+
# Set the reported status only after the FE is destroyed, because the Server assumes that all FE resources are freed when the status changes.
|
495
|
+
self._update_reported_state(
|
496
|
+
FunctionExecutorState(
|
497
|
+
description=self._fe_description,
|
498
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
|
499
|
+
termination_reason=self._fe_termination_reason,
|
500
|
+
)
|
501
|
+
)
|
502
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATED)
|
503
|
+
|
486
504
|
# Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
|
487
505
|
self._add_event(
|
488
506
|
ScheduleTaskExecution(),
|
@@ -494,7 +512,7 @@ class FunctionExecutorController:
|
|
494
512
|
"Function Executor health check failed, terminating Function Executor",
|
495
513
|
reason=result.reason,
|
496
514
|
)
|
497
|
-
self.
|
515
|
+
self._start_termination(
|
498
516
|
termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
|
499
517
|
)
|
500
518
|
|
@@ -533,14 +551,15 @@ class FunctionExecutorController:
|
|
533
551
|
if len(self._runnable_tasks) == 0:
|
534
552
|
return
|
535
553
|
|
536
|
-
if self.
|
537
|
-
|
538
|
-
|
554
|
+
if self._internal_state not in [
|
555
|
+
_FE_CONTROLLER_STATE.RUNNING,
|
556
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
557
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
539
558
|
]:
|
540
|
-
return # Can't progress
|
559
|
+
return # Can't progress runnable tasks in the current state.
|
541
560
|
|
542
561
|
if (
|
543
|
-
self.
|
562
|
+
self._internal_state == _FE_CONTROLLER_STATE.RUNNING
|
544
563
|
and self._running_task is not None
|
545
564
|
):
|
546
565
|
return
|
@@ -556,12 +575,15 @@ class FunctionExecutorController:
|
|
556
575
|
if task_info.is_cancelled:
|
557
576
|
task_info.output = TaskOutput.task_cancelled(task_info.allocation)
|
558
577
|
self._start_task_output_upload(task_info)
|
559
|
-
elif self.
|
578
|
+
elif self._internal_state in [
|
579
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
580
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
581
|
+
]:
|
560
582
|
task_info.output = TaskOutput.function_executor_terminated(
|
561
583
|
task_info.allocation
|
562
584
|
)
|
563
585
|
self._start_task_output_upload(task_info)
|
564
|
-
elif self.
|
586
|
+
elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
|
565
587
|
self._running_task = task_info
|
566
588
|
next_aio = run_task_on_function_executor(
|
567
589
|
task_info=task_info,
|
@@ -604,7 +626,7 @@ class FunctionExecutorController:
|
|
604
626
|
ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
|
605
627
|
)
|
606
628
|
else:
|
607
|
-
self.
|
629
|
+
self._start_termination(
|
608
630
|
termination_reason=event.function_executor_termination_reason
|
609
631
|
)
|
610
632
|
|
@@ -661,24 +683,31 @@ class FunctionExecutorController:
|
|
661
683
|
)
|
662
684
|
self._state_reporter.schedule_state_report()
|
663
685
|
|
664
|
-
def
|
686
|
+
def _start_termination(
|
665
687
|
self, termination_reason: FunctionExecutorTerminationReason
|
666
688
|
) -> None:
|
667
|
-
"""
|
689
|
+
"""Starts termination of the Function Executor if it's not started yet.
|
668
690
|
|
669
691
|
Doesn't raise any exceptions. Doesn't block.
|
670
692
|
"""
|
693
|
+
if self._internal_state in [
|
694
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
695
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
696
|
+
]:
|
697
|
+
# _start_termination() can be called multiple times, e.g. by each failed task alloc
|
698
|
+
# when the FE is unhealthy. Dedup the calls to keep state machine consistent.
|
699
|
+
return
|
700
|
+
|
701
|
+
self._fe_termination_reason = termination_reason
|
702
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATING)
|
671
703
|
next_aio = destroy_function_executor(
|
672
704
|
function_executor=self._fe,
|
673
|
-
|
705
|
+
lock=self._destroy_lock,
|
674
706
|
logger=self._logger,
|
675
707
|
)
|
676
|
-
self._fe = None
|
677
708
|
self._spawn_aio_for_fe(
|
678
709
|
aio=next_aio,
|
679
|
-
on_exception=FunctionExecutorDestroyed(
|
680
|
-
is_success=False, termination_reason=termination_reason
|
681
|
-
),
|
710
|
+
on_exception=FunctionExecutorDestroyed(is_success=False),
|
682
711
|
)
|
683
712
|
|
684
713
|
async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
|
@@ -717,16 +746,15 @@ class FunctionExecutorController:
|
|
717
746
|
# BaseException includes asyncio.CancelledError which is always raised here.
|
718
747
|
pass
|
719
748
|
|
720
|
-
|
721
|
-
self.
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
status=_to_fe_status_metric_label(self._status, self._logger)
|
749
|
+
await destroy_function_executor(
|
750
|
+
function_executor=self._fe,
|
751
|
+
lock=self._destroy_lock,
|
752
|
+
logger=self._logger,
|
753
|
+
)
|
754
|
+
|
755
|
+
# Cleanup the metric from this FE.
|
756
|
+
metric_function_executors_with_state.labels(
|
757
|
+
state=_to_fe_state_metric_label(self._internal_state, self._logger)
|
730
758
|
).dec()
|
731
759
|
|
732
760
|
self._state_reporter.remove_function_executor_state(self.function_executor_id())
|
@@ -736,21 +764,23 @@ class FunctionExecutorController:
|
|
736
764
|
debug_print_events(events=self._events, logger=self._logger)
|
737
765
|
|
738
766
|
|
739
|
-
def
|
740
|
-
if
|
741
|
-
return
|
742
|
-
elif
|
743
|
-
return
|
744
|
-
elif
|
745
|
-
return
|
746
|
-
elif
|
747
|
-
return
|
767
|
+
def _to_fe_state_metric_label(state: _FE_CONTROLLER_STATE, logger: Any) -> str:
|
768
|
+
if state == _FE_CONTROLLER_STATE.NOT_STARTED:
|
769
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
|
770
|
+
elif state == _FE_CONTROLLER_STATE.STARTING_UP:
|
771
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
|
772
|
+
elif state == _FE_CONTROLLER_STATE.RUNNING:
|
773
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
|
774
|
+
elif state == _FE_CONTROLLER_STATE.TERMINATING:
|
775
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
|
776
|
+
elif state == _FE_CONTROLLER_STATE.TERMINATED:
|
777
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
|
748
778
|
else:
|
749
779
|
logger.error(
|
750
|
-
"unexpected Function Executor
|
751
|
-
|
780
|
+
"unexpected Function Executor internal state",
|
781
|
+
state=state.name,
|
752
782
|
)
|
753
|
-
return
|
783
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
|
754
784
|
|
755
785
|
|
756
786
|
_termination_reason_to_short_name_map = {
|
@@ -758,8 +788,6 @@ _termination_reason_to_short_name_map = {
|
|
758
788
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
|
759
789
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
|
760
790
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
|
761
|
-
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
|
762
|
-
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
|
763
791
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
|
764
792
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
|
765
793
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
|
@@ -786,16 +814,14 @@ def _to_task_result_proto(output: TaskOutput) -> TaskResult:
|
|
786
814
|
graph_invocation_id=output.allocation.task.graph_invocation_id,
|
787
815
|
reducer=output.reducer,
|
788
816
|
outcome_code=output.outcome_code,
|
789
|
-
|
817
|
+
failure_reason=output.failure_reason,
|
818
|
+
next_functions=output.next_functions,
|
790
819
|
function_outputs=output.uploaded_data_payloads,
|
820
|
+
invocation_error_output=output.uploaded_invocation_error_output,
|
791
821
|
)
|
792
|
-
if output.failure_reason is not None:
|
793
|
-
task_result.failure_reason = output.failure_reason
|
794
822
|
if output.uploaded_stdout is not None:
|
795
823
|
task_result.stdout.CopyFrom(output.uploaded_stdout)
|
796
824
|
if output.uploaded_stderr is not None:
|
797
825
|
task_result.stderr.CopyFrom(output.uploaded_stderr)
|
798
|
-
if output.router_output is not None:
|
799
|
-
task_result.routing.next_functions[:] = output.router_output.edges
|
800
826
|
|
801
827
|
return task_result
|
@@ -34,27 +34,34 @@ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
|
|
34
34
|
)
|
35
35
|
)
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
["status"],
|
42
|
-
)
|
37
|
+
metric_function_executors_with_state: prometheus_client.Gauge = prometheus_client.Gauge(
|
38
|
+
"function_executors_with_state",
|
39
|
+
"Number of Function Executors with a particular internal state",
|
40
|
+
["state"],
|
43
41
|
)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN = "unknown"
|
43
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED = "not_started"
|
44
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP = "starting_up"
|
45
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING = "running"
|
46
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING = "terminating"
|
47
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED = "terminated"
|
48
|
+
|
48
49
|
|
49
|
-
|
50
|
-
|
50
|
+
metric_function_executors_with_state.labels(
|
51
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
|
52
|
+
)
|
53
|
+
metric_function_executors_with_state.labels(
|
54
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
|
55
|
+
)
|
56
|
+
metric_function_executors_with_state.labels(
|
57
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
|
51
58
|
)
|
52
|
-
|
53
|
-
|
59
|
+
metric_function_executors_with_state.labels(
|
60
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
|
54
61
|
)
|
55
|
-
|
56
|
-
|
62
|
+
metric_function_executors_with_state.labels(
|
63
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
|
57
64
|
)
|
58
|
-
|
59
|
-
|
65
|
+
metric_function_executors_with_state.labels(
|
66
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
|
60
67
|
)
|
@@ -1,4 +1,6 @@
|
|
1
1
|
import asyncio
|
2
|
+
import os
|
3
|
+
import random
|
2
4
|
import time
|
3
5
|
from typing import Any, Optional
|
4
6
|
|
@@ -6,6 +8,13 @@ import grpc
|
|
6
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
9
|
RunTaskRequest,
|
8
10
|
RunTaskResponse,
|
11
|
+
SerializedObject,
|
12
|
+
)
|
13
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
14
|
+
TaskFailureReason as FETaskFailureReason,
|
15
|
+
)
|
16
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
17
|
+
TaskOutcomeCode as FETaskOutcomeCode,
|
9
18
|
)
|
10
19
|
from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
11
20
|
FunctionExecutorStub,
|
@@ -31,6 +40,10 @@ from .metrics.run_task import (
|
|
31
40
|
from .task_info import TaskInfo
|
32
41
|
from .task_output import TaskMetrics, TaskOutput
|
33
42
|
|
43
|
+
_ENABLE_INJECT_TASK_CANCELLATIONS = (
|
44
|
+
os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
|
45
|
+
)
|
46
|
+
|
34
47
|
|
35
48
|
async def run_task_on_function_executor(
|
36
49
|
task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
|
@@ -83,6 +96,7 @@ async def run_task_on_function_executor(
|
|
83
96
|
task_info.output = _task_output_from_function_executor_response(
|
84
97
|
allocation=task_info.allocation,
|
85
98
|
response=response,
|
99
|
+
logger=logger,
|
86
100
|
)
|
87
101
|
except grpc.aio.AioRpcError as e:
|
88
102
|
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
@@ -140,13 +154,13 @@ async def run_task_on_function_executor(
|
|
140
154
|
|
141
155
|
|
142
156
|
def _task_output_from_function_executor_response(
|
143
|
-
allocation: TaskAllocation, response: RunTaskResponse
|
157
|
+
allocation: TaskAllocation, response: RunTaskResponse, logger: Any
|
144
158
|
) -> TaskOutput:
|
145
159
|
response_validator = MessageValidator(response)
|
146
160
|
response_validator.required_field("stdout")
|
147
161
|
response_validator.required_field("stderr")
|
148
162
|
response_validator.required_field("is_reducer")
|
149
|
-
response_validator.required_field("
|
163
|
+
response_validator.required_field("outcome_code")
|
150
164
|
|
151
165
|
metrics = TaskMetrics(counters={}, timers={})
|
152
166
|
if response.HasField("metrics"):
|
@@ -154,31 +168,42 @@ def _task_output_from_function_executor_response(
|
|
154
168
|
metrics.counters = dict(response.metrics.counters)
|
155
169
|
metrics.timers = dict(response.metrics.timers)
|
156
170
|
|
157
|
-
|
171
|
+
outcome_code: TaskOutcomeCode = _to_task_outcome_code(
|
172
|
+
response.outcome_code, logger=logger
|
173
|
+
)
|
174
|
+
failure_reason: Optional[TaskFailureReason] = None
|
175
|
+
invocation_error_output: Optional[SerializedObject] = None
|
176
|
+
|
177
|
+
if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
|
178
|
+
response_validator.required_field("failure_reason")
|
179
|
+
failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
|
180
|
+
response.failure_reason, logger
|
181
|
+
)
|
182
|
+
if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
|
183
|
+
response_validator.required_field("invocation_error_output")
|
184
|
+
invocation_error_output = response.invocation_error_output
|
185
|
+
|
186
|
+
if _ENABLE_INJECT_TASK_CANCELLATIONS:
|
187
|
+
logger.warning("injecting cancellation failure for the task allocation")
|
188
|
+
if (
|
189
|
+
random.random() < 0.5
|
190
|
+
): # 50% chance to get stable reproduction in manual testing
|
191
|
+
outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
|
192
|
+
failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
|
193
|
+
|
194
|
+
return TaskOutput(
|
158
195
|
allocation=allocation,
|
159
|
-
outcome_code=
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
failure_reason=(
|
165
|
-
None
|
166
|
-
if response.success
|
167
|
-
else TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
|
168
|
-
),
|
196
|
+
outcome_code=outcome_code,
|
197
|
+
failure_reason=failure_reason,
|
198
|
+
invocation_error_output=invocation_error_output,
|
199
|
+
function_outputs=response.function_outputs,
|
200
|
+
next_functions=response.next_functions,
|
169
201
|
stdout=response.stdout,
|
170
202
|
stderr=response.stderr,
|
171
203
|
reducer=response.is_reducer,
|
172
204
|
metrics=metrics,
|
173
205
|
)
|
174
206
|
|
175
|
-
if response.HasField("function_output"):
|
176
|
-
output.function_output = response.function_output
|
177
|
-
if response.HasField("router_output"):
|
178
|
-
output.router_output = response.router_output
|
179
|
-
|
180
|
-
return output
|
181
|
-
|
182
207
|
|
183
208
|
def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
|
184
209
|
logger.info(
|
@@ -191,3 +216,40 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
|
|
191
216
|
else None
|
192
217
|
),
|
193
218
|
)
|
219
|
+
|
220
|
+
|
221
|
+
def _to_task_outcome_code(
|
222
|
+
fe_task_outcome_code: FETaskOutcomeCode, logger
|
223
|
+
) -> TaskOutcomeCode:
|
224
|
+
if fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
|
225
|
+
return TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
|
226
|
+
elif fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
|
227
|
+
return TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
|
228
|
+
else:
|
229
|
+
logger.warning(
|
230
|
+
"Unknown TaskOutcomeCode received from Function Executor",
|
231
|
+
value=FETaskOutcomeCode.Name(fe_task_outcome_code),
|
232
|
+
)
|
233
|
+
return TaskOutcomeCode.TASK_OUTCOME_CODE_UNKNOWN
|
234
|
+
|
235
|
+
|
236
|
+
def _to_task_failure_reason(
|
237
|
+
fe_task_failure_reason: FETaskFailureReason, logger: Any
|
238
|
+
) -> TaskFailureReason:
|
239
|
+
if fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR:
|
240
|
+
return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
|
241
|
+
elif (
|
242
|
+
fe_task_failure_reason
|
243
|
+
== FETaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
|
244
|
+
):
|
245
|
+
return TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
|
246
|
+
elif (
|
247
|
+
fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
|
248
|
+
):
|
249
|
+
return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
|
250
|
+
else:
|
251
|
+
logger.warning(
|
252
|
+
"Unknown TaskFailureReason received from Function Executor",
|
253
|
+
value=FETaskFailureReason.Name(fe_task_failure_reason),
|
254
|
+
)
|
255
|
+
return TaskFailureReason.TASK_FAILURE_REASON_UNKNOWN
|