indexify 0.4.10__tar.gz → 0.4.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.4.10 → indexify-0.4.12}/PKG-INFO +2 -2
- {indexify-0.4.10 → indexify-0.4.12}/pyproject.toml +2 -2
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/events.py +18 -16
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/function_executor_controller.py +177 -121
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py +25 -18
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/run_task.py +14 -0
- indexify-0.4.12/src/indexify/executor/function_executor_controller/terminate_function_executor.py +38 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/state_reconciler.py +2 -7
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api.proto +2 -4
- indexify-0.4.12/src/indexify/proto/executor_api_pb2.py +88 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api_pb2.pyi +9 -13
- indexify-0.4.10/src/indexify/executor/function_executor_controller/destroy_function_executor.py +0 -28
- indexify-0.4.10/src/indexify/proto/executor_api_pb2.py +0 -88
- {indexify-0.4.10 → indexify-0.4.12}/README.md +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/__init__.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/build_image.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/deploy.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/README.md +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/blob_store.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/channel_manager.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_allowlist.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/function_executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/health_checker.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/__init__.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/completed_task_metrics.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/create_function_executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/debug_event_loop.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/downloads.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/loggers.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/message_validators.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/downloads.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/run_task.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/prepare_task.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/task_info.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/task_output.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/upload_task_output.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/host_resources.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/nvidia_gpu.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/channel_manager.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/executor.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/state_reconciler.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/state_reporter.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/handler.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/metrics.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/server.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/state_reporter.py +0 -0
- {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: indexify
|
3
|
-
Version: 0.4.10
|
3
|
+
Version: 0.4.12
|
4
4
|
Summary: Open Source Indexify components and helper tools
|
5
5
|
Home-page: https://github.com/tensorlakeai/indexify
|
6
6
|
License: Apache 2.0
|
@@ -17,7 +17,7 @@ Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
|
|
17
17
|
Requires-Dist: boto3 (>=1.37.30,<2.0.0)
|
18
18
|
Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
|
19
19
|
Requires-Dist: psutil (>=7.0.0,<8.0.0)
|
20
|
-
Requires-Dist: tensorlake (==0.2.
|
20
|
+
Requires-Dist: tensorlake (==0.2.8)
|
21
21
|
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
22
22
|
Description-Content-Type: text/markdown
|
23
23
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this packages are updated.
|
4
|
-
version = "0.4.10"
|
4
|
+
version = "0.4.12"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -25,7 +25,7 @@ prometheus-client = "^0.21.1"
|
|
25
25
|
psutil = "^7.0.0"
|
26
26
|
# Adds function-executor binary, utils lib, sdk used in indexify-cli commands.
|
27
27
|
# We need to specify the tensorlake version exactly because pip install doesn't respect poetry.lock files.
|
28
|
-
tensorlake = "0.2.
|
28
|
+
tensorlake = "0.2.8"
|
29
29
|
# Uncomment the next line to use local tensorlake package (only for development!)
|
30
30
|
# tensorlake = { path = "../tensorlake", develop = true }
|
31
31
|
# pydantic is provided by tensorlake
|
{indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/events.py
RENAMED
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Optional
|
2
|
+
from typing import List, Optional
|
3
3
|
|
4
4
|
from indexify.executor.function_executor.function_executor import (
|
5
5
|
FunctionExecutor,
|
@@ -12,7 +12,7 @@ from .task_info import TaskInfo
|
|
12
12
|
|
13
13
|
class EventType(Enum):
|
14
14
|
FUNCTION_EXECUTOR_CREATED = 1
|
15
|
-
|
15
|
+
FUNCTION_EXECUTOR_TERMINATED = 2
|
16
16
|
SHUTDOWN_INITIATED = 3
|
17
17
|
TASK_PREPARATION_FINISHED = 4
|
18
18
|
SCHEDULE_TASK_EXECUTION = 5
|
@@ -50,23 +50,32 @@ class FunctionExecutorCreated(BaseEvent):
|
|
50
50
|
self.output: FunctionExecutorStartupOutput = output
|
51
51
|
|
52
52
|
|
53
|
-
class
|
53
|
+
class FunctionExecutorTerminated(BaseEvent):
|
54
54
|
"""
|
55
|
-
Event indicating that Function Executor has been destroyed.
|
55
|
+
Event indicating that Function Executor has been terminated (destroyed).
|
56
56
|
"""
|
57
57
|
|
58
58
|
def __init__(
|
59
|
-
self,
|
59
|
+
self,
|
60
|
+
is_success: bool,
|
61
|
+
fe_termination_reason: FunctionExecutorTerminationReason,
|
62
|
+
allocation_ids_caused_termination: List[str],
|
60
63
|
):
|
61
|
-
super().__init__(EventType.
|
64
|
+
super().__init__(EventType.FUNCTION_EXECUTOR_TERMINATED)
|
62
65
|
self.is_success: bool = is_success
|
63
|
-
self.
|
66
|
+
self.fe_termination_reason: FunctionExecutorTerminationReason = (
|
67
|
+
fe_termination_reason
|
68
|
+
)
|
69
|
+
self.allocation_ids_caused_termination: List[str] = (
|
70
|
+
allocation_ids_caused_termination
|
71
|
+
)
|
64
72
|
|
65
73
|
def __str__(self) -> str:
|
66
74
|
return (
|
67
75
|
f"Event(type={self.event_type.name}, "
|
68
76
|
f"is_success={self.is_success}, "
|
69
|
-
f"
|
77
|
+
f"fe_termination_reason={FunctionExecutorTerminationReason.Name(self.fe_termination_reason)}, "
|
78
|
+
f"allocation_ids_caused_termination={self.allocation_ids_caused_termination})"
|
70
79
|
)
|
71
80
|
|
72
81
|
|
@@ -75,15 +84,8 @@ class ShutdownInitiated(BaseEvent):
|
|
75
84
|
Event indicating that Function Executor shutdown has been initiated.
|
76
85
|
"""
|
77
86
|
|
78
|
-
def __init__(self
|
87
|
+
def __init__(self):
|
79
88
|
super().__init__(EventType.SHUTDOWN_INITIATED)
|
80
|
-
self.termination_reason: FunctionExecutorTerminationReason = termination_reason
|
81
|
-
|
82
|
-
def __str__(self) -> str:
|
83
|
-
return (
|
84
|
-
f"Event(type={self.event_type.name}, "
|
85
|
-
f"termination_reason={FunctionExecutorTerminationReason.Name(self.termination_reason)})"
|
86
|
-
)
|
87
89
|
|
88
90
|
|
89
91
|
class TaskPreparationFinished(BaseEvent):
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import asyncio
|
2
2
|
import time
|
3
3
|
from collections.abc import Coroutine
|
4
|
+
from enum import Enum
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Dict, List, Optional
|
6
7
|
|
@@ -28,12 +29,11 @@ from .debug_event_loop import (
|
|
28
29
|
debug_print_events,
|
29
30
|
debug_print_processing_event,
|
30
31
|
)
|
31
|
-
from .destroy_function_executor import destroy_function_executor
|
32
32
|
from .events import (
|
33
33
|
BaseEvent,
|
34
34
|
EventType,
|
35
35
|
FunctionExecutorCreated,
|
36
|
-
|
36
|
+
FunctionExecutorTerminated,
|
37
37
|
ScheduleTaskExecution,
|
38
38
|
ShutdownInitiated,
|
39
39
|
TaskExecutionFinished,
|
@@ -43,12 +43,14 @@ from .events import (
|
|
43
43
|
from .function_executor_startup_output import FunctionExecutorStartupOutput
|
44
44
|
from .loggers import function_executor_logger, task_allocation_logger
|
45
45
|
from .metrics.function_executor_controller import (
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
|
47
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING,
|
48
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP,
|
49
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED,
|
50
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING,
|
51
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
|
50
52
|
metric_control_loop_handle_event_latency,
|
51
|
-
|
53
|
+
metric_function_executors_with_state,
|
52
54
|
metric_runnable_tasks,
|
53
55
|
metric_runnable_tasks_per_function_name,
|
54
56
|
metric_schedule_task_latency,
|
@@ -58,9 +60,20 @@ from .prepare_task import prepare_task
|
|
58
60
|
from .run_task import run_task_on_function_executor
|
59
61
|
from .task_info import TaskInfo
|
60
62
|
from .task_output import TaskOutput
|
63
|
+
from .terminate_function_executor import terminate_function_executor
|
61
64
|
from .upload_task_output import upload_task_output
|
62
65
|
|
63
66
|
|
67
|
+
# Actual FE controller states, they are a bit different from statuses reported to the Server.
|
68
|
+
# All the valid state transitions are forward only (can skip multiple states in a row).
|
69
|
+
class _FE_CONTROLLER_STATE(Enum):
|
70
|
+
NOT_STARTED = 1
|
71
|
+
STARTING_UP = 2
|
72
|
+
RUNNING = 3
|
73
|
+
TERMINATING = 4
|
74
|
+
TERMINATED = 5
|
75
|
+
|
76
|
+
|
64
77
|
class FunctionExecutorController:
|
65
78
|
def __init__(
|
66
79
|
self,
|
@@ -94,19 +107,17 @@ class FunctionExecutorController:
|
|
94
107
|
self._logger: Any = function_executor_logger(
|
95
108
|
function_executor_description, logger.bind(module=__name__)
|
96
109
|
)
|
97
|
-
|
98
|
-
# the same event loop.
|
110
|
+
self._destroy_lock: asyncio.Lock = asyncio.Lock()
|
111
|
+
# Mutable state. No lock needed as it's modified by async tasks running in the same event loop.
|
99
112
|
self._fe: Optional[FunctionExecutor] = None
|
100
|
-
self.
|
101
|
-
|
102
|
-
|
103
|
-
# FE Status reported to Server.
|
104
|
-
self._status: FunctionExecutorStatus = (
|
105
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
|
106
|
-
)
|
107
|
-
metric_function_executors_with_status.labels(
|
108
|
-
status=_to_fe_status_metric_label(self._status, self._logger)
|
113
|
+
self._internal_state = _FE_CONTROLLER_STATE.NOT_STARTED
|
114
|
+
metric_function_executors_with_state.labels(
|
115
|
+
state=_to_fe_state_metric_label(self._internal_state, self._logger)
|
109
116
|
).inc()
|
117
|
+
self._reported_state: FunctionExecutorState = FunctionExecutorState(
|
118
|
+
description=function_executor_description,
|
119
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN,
|
120
|
+
)
|
110
121
|
# Ordered list of events to be processed by the control loop.
|
111
122
|
self._events: List[BaseEvent] = []
|
112
123
|
# Asyncio event used to notify the control loop that there are new events to process.
|
@@ -124,13 +135,6 @@ class FunctionExecutorController:
|
|
124
135
|
def function_executor_id(self) -> str:
|
125
136
|
return self._fe_description.id
|
126
137
|
|
127
|
-
def status(self) -> FunctionExecutorStatus:
|
128
|
-
"""Returns the current status of the Function Executor.
|
129
|
-
|
130
|
-
Not blocking.
|
131
|
-
"""
|
132
|
-
return self._status
|
133
|
-
|
134
138
|
def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
|
135
139
|
"""Adds a task to the Function Executor.
|
136
140
|
|
@@ -205,9 +209,10 @@ class FunctionExecutorController:
|
|
205
209
|
"""Starts up the Function Executor and prepares it to run tasks.
|
206
210
|
|
207
211
|
Not blocking. Never raises exceptions."""
|
208
|
-
if self.
|
212
|
+
if self._internal_state != _FE_CONTROLLER_STATE.NOT_STARTED:
|
209
213
|
self._logger.warning(
|
210
|
-
"
|
214
|
+
"function executor state is not NOT_STARTED, ignoring startup call",
|
215
|
+
internal_state=self._internal_state.name,
|
211
216
|
)
|
212
217
|
return
|
213
218
|
|
@@ -215,7 +220,13 @@ class FunctionExecutorController:
|
|
215
220
|
self._control_loop(),
|
216
221
|
name="function executor control loop",
|
217
222
|
)
|
218
|
-
self.
|
223
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.STARTING_UP)
|
224
|
+
self._update_reported_state(
|
225
|
+
FunctionExecutorState(
|
226
|
+
description=self._fe_description,
|
227
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING,
|
228
|
+
)
|
229
|
+
)
|
219
230
|
next_aio = create_function_executor(
|
220
231
|
function_executor_description=self._fe_description,
|
221
232
|
function_executor_server_factory=self._fe_server_factory,
|
@@ -237,17 +248,13 @@ class FunctionExecutorController:
|
|
237
248
|
),
|
238
249
|
)
|
239
250
|
|
240
|
-
async def shutdown(
|
241
|
-
self, termination_reason: FunctionExecutorTerminationReason
|
242
|
-
) -> None:
|
251
|
+
async def shutdown(self) -> None:
|
243
252
|
"""Shutsdown the Function Executor and frees all of its resources.
|
244
253
|
|
245
|
-
|
254
|
+
No task outcomes and outputs are getting reported to Server after this call.
|
246
255
|
Doesn't raise any exceptions. Blocks until the shutdown is complete.
|
247
256
|
"""
|
248
|
-
self._add_event(
|
249
|
-
ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
|
250
|
-
)
|
257
|
+
self._add_event(ShutdownInitiated(), source="shutdown")
|
251
258
|
try:
|
252
259
|
await self._control_loop_aio_task
|
253
260
|
except asyncio.CancelledError:
|
@@ -259,51 +266,49 @@ class FunctionExecutorController:
|
|
259
266
|
)
|
260
267
|
self._logger.info("function executor controller shutdown finished")
|
261
268
|
|
262
|
-
def
|
269
|
+
def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
|
270
|
+
"""Updates the internal state of the Function Executor Controller.
|
271
|
+
|
272
|
+
Not blocking. Never raises exceptions."""
|
273
|
+
old_state: _FE_CONTROLLER_STATE = self._internal_state
|
274
|
+
self._internal_state = new_state
|
275
|
+
|
276
|
+
self._logger.info(
|
277
|
+
"function executor internal state changed",
|
278
|
+
old_state=old_state.name,
|
279
|
+
new_state=new_state.name,
|
280
|
+
)
|
281
|
+
|
282
|
+
metric_function_executors_with_state.labels(
|
283
|
+
state=_to_fe_state_metric_label(old_state, self._logger)
|
284
|
+
).dec()
|
285
|
+
metric_function_executors_with_state.labels(
|
286
|
+
state=_to_fe_state_metric_label(new_state, self._logger)
|
287
|
+
).inc()
|
288
|
+
|
289
|
+
def _update_reported_state(
|
263
290
|
self,
|
264
|
-
|
291
|
+
new_state: FunctionExecutorState,
|
265
292
|
) -> None:
|
266
|
-
"""Sets Function Executor
|
293
|
+
"""Sets new Function Executor state and reports it to the Server.
|
267
294
|
|
268
295
|
Not blocking. Never raises exceptions."""
|
269
|
-
|
270
|
-
|
271
|
-
self._status: FunctionExecutorStatus = new_status
|
296
|
+
old_state: FunctionExecutorState = self._reported_state
|
297
|
+
self._reported_state = new_state
|
272
298
|
|
273
299
|
self._logger.info(
|
274
|
-
"function executor status changed",
|
275
|
-
old_status=FunctionExecutorStatus.Name(
|
276
|
-
new_status=FunctionExecutorStatus.Name(
|
300
|
+
"function executor grpc status changed",
|
301
|
+
old_status=FunctionExecutorStatus.Name(old_state.status),
|
302
|
+
new_status=FunctionExecutorStatus.Name(new_state.status),
|
277
303
|
termination_reason=_termination_reason_to_short_name(
|
278
|
-
|
304
|
+
new_state.termination_reason
|
279
305
|
),
|
280
306
|
)
|
281
|
-
metric_function_executors_with_status.labels(
|
282
|
-
status=_to_fe_status_metric_label(old_status, self._logger)
|
283
|
-
).dec()
|
284
|
-
metric_function_executors_with_status.labels(
|
285
|
-
status=_to_fe_status_metric_label(new_status, self._logger)
|
286
|
-
).inc()
|
287
307
|
|
288
|
-
self._state_reporter.update_function_executor_state(
|
308
|
+
self._state_reporter.update_function_executor_state(new_state)
|
289
309
|
# Report the status change to the Server asap to reduce latency in the system.
|
290
310
|
self._state_reporter.schedule_state_report()
|
291
311
|
|
292
|
-
def _current_state(self) -> FunctionExecutorState:
|
293
|
-
"""Returns the current state of the Function Executor.
|
294
|
-
|
295
|
-
Not blocking. Never raises exceptions.
|
296
|
-
"""
|
297
|
-
termination_reason: Optional[FunctionExecutorTerminationReason] = None
|
298
|
-
if self._fe_termination_reason is not None:
|
299
|
-
termination_reason = self._fe_termination_reason
|
300
|
-
|
301
|
-
return FunctionExecutorState(
|
302
|
-
description=self._fe_description,
|
303
|
-
status=self._status,
|
304
|
-
termination_reason=termination_reason,
|
305
|
-
)
|
306
|
-
|
307
312
|
async def _control_loop(self) -> None:
|
308
313
|
"""Runs control loop that coordinates all the work done by the Function Executor.
|
309
314
|
|
@@ -341,8 +346,8 @@ class FunctionExecutorController:
|
|
341
346
|
"""
|
342
347
|
if event.event_type == EventType.FUNCTION_EXECUTOR_CREATED:
|
343
348
|
return self._handle_event_function_executor_created(event)
|
344
|
-
elif event.event_type == EventType.
|
345
|
-
return self.
|
349
|
+
elif event.event_type == EventType.FUNCTION_EXECUTOR_TERMINATED:
|
350
|
+
return self._handle_event_function_executor_terminated(event)
|
346
351
|
elif event.event_type == EventType.TASK_PREPARATION_FINISHED:
|
347
352
|
return self._handle_event_task_preparation_finished(event)
|
348
353
|
elif event.event_type == EventType.SCHEDULE_TASK_EXECUTION:
|
@@ -454,13 +459,25 @@ class FunctionExecutorController:
|
|
454
459
|
self._state_reporter.schedule_state_report()
|
455
460
|
|
456
461
|
if event.function_executor is None:
|
457
|
-
|
458
|
-
|
462
|
+
# Server needs to increment attempts counter for all the tasks that were pending while FE was starting up.
|
463
|
+
# This prevents infinite retries if FEs consistently fail to start up.
|
464
|
+
self._start_termination(
|
465
|
+
fe_termination_reason=event.output.termination_reason,
|
466
|
+
allocation_ids_caused_termination=[
|
467
|
+
task_info.allocation.allocation_id
|
468
|
+
for task_info in self._tasks.values()
|
469
|
+
],
|
459
470
|
)
|
460
471
|
return
|
461
472
|
|
462
473
|
self._fe = event.function_executor
|
463
|
-
self.
|
474
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.RUNNING)
|
475
|
+
self._update_reported_state(
|
476
|
+
FunctionExecutorState(
|
477
|
+
description=self._fe_description,
|
478
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
|
479
|
+
)
|
480
|
+
)
|
464
481
|
# Health checker starts after FE creation and gets automatically stopped on FE destroy.
|
465
482
|
self._fe.health_checker().start(self._health_check_failed_callback)
|
466
483
|
self._add_event(
|
@@ -468,20 +485,30 @@ class FunctionExecutorController:
|
|
468
485
|
source="_handle_event_function_executor_created",
|
469
486
|
)
|
470
487
|
|
471
|
-
def
|
472
|
-
self, event:
|
488
|
+
def _handle_event_function_executor_terminated(
|
489
|
+
self, event: FunctionExecutorTerminated
|
473
490
|
) -> None:
|
474
|
-
"""Handles the Function Executor
|
491
|
+
"""Handles the Function Executor terminated event.
|
475
492
|
|
476
493
|
Doesn't raise any exceptions. Doesn't block.
|
477
494
|
"""
|
478
495
|
if not event.is_success:
|
479
496
|
self._logger.error(
|
480
|
-
"Function Executor
|
497
|
+
"Function Executor termination failed unexpectedly, this should never happen",
|
498
|
+
)
|
499
|
+
|
500
|
+
self._fe = None
|
501
|
+
# Set reported status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
|
502
|
+
self._update_reported_state(
|
503
|
+
FunctionExecutorState(
|
504
|
+
description=self._fe_description,
|
505
|
+
status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
|
506
|
+
termination_reason=event.fe_termination_reason,
|
507
|
+
allocation_ids_caused_termination=event.allocation_ids_caused_termination,
|
481
508
|
)
|
482
|
-
|
483
|
-
self.
|
484
|
-
|
509
|
+
)
|
510
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATED)
|
511
|
+
|
485
512
|
# Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
|
486
513
|
self._add_event(
|
487
514
|
ScheduleTaskExecution(),
|
@@ -493,8 +520,14 @@ class FunctionExecutorController:
|
|
493
520
|
"Function Executor health check failed, terminating Function Executor",
|
494
521
|
reason=result.reason,
|
495
522
|
)
|
496
|
-
|
497
|
-
|
523
|
+
|
524
|
+
self._start_termination(
|
525
|
+
fe_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY,
|
526
|
+
allocation_ids_caused_termination=(
|
527
|
+
[]
|
528
|
+
if self._running_task is None
|
529
|
+
else [self._running_task.allocation.allocation_id]
|
530
|
+
),
|
498
531
|
)
|
499
532
|
|
500
533
|
def _handle_event_task_preparation_finished(
|
@@ -532,14 +565,15 @@ class FunctionExecutorController:
|
|
532
565
|
if len(self._runnable_tasks) == 0:
|
533
566
|
return
|
534
567
|
|
535
|
-
if self.
|
536
|
-
|
537
|
-
|
568
|
+
if self._internal_state not in [
|
569
|
+
_FE_CONTROLLER_STATE.RUNNING,
|
570
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
571
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
538
572
|
]:
|
539
|
-
return # Can't progress
|
573
|
+
return # Can't progress runnable tasks in the current state.
|
540
574
|
|
541
575
|
if (
|
542
|
-
self.
|
576
|
+
self._internal_state == _FE_CONTROLLER_STATE.RUNNING
|
543
577
|
and self._running_task is not None
|
544
578
|
):
|
545
579
|
return
|
@@ -555,12 +589,15 @@ class FunctionExecutorController:
|
|
555
589
|
if task_info.is_cancelled:
|
556
590
|
task_info.output = TaskOutput.task_cancelled(task_info.allocation)
|
557
591
|
self._start_task_output_upload(task_info)
|
558
|
-
elif self.
|
592
|
+
elif self._internal_state in [
|
593
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
594
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
595
|
+
]:
|
559
596
|
task_info.output = TaskOutput.function_executor_terminated(
|
560
597
|
task_info.allocation
|
561
598
|
)
|
562
599
|
self._start_task_output_upload(task_info)
|
563
|
-
elif self.
|
600
|
+
elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
|
564
601
|
self._running_task = task_info
|
565
602
|
next_aio = run_task_on_function_executor(
|
566
603
|
task_info=task_info,
|
@@ -603,8 +640,11 @@ class FunctionExecutorController:
|
|
603
640
|
ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
|
604
641
|
)
|
605
642
|
else:
|
606
|
-
self.
|
607
|
-
|
643
|
+
self._start_termination(
|
644
|
+
fe_termination_reason=event.function_executor_termination_reason,
|
645
|
+
allocation_ids_caused_termination=[
|
646
|
+
event.task_info.allocation.allocation_id
|
647
|
+
],
|
608
648
|
)
|
609
649
|
|
610
650
|
# Ignore is_cancelled because cancelling a task still involves uploading its output.
|
@@ -660,23 +700,37 @@ class FunctionExecutorController:
|
|
660
700
|
)
|
661
701
|
self._state_reporter.schedule_state_report()
|
662
702
|
|
663
|
-
def
|
664
|
-
self,
|
703
|
+
def _start_termination(
|
704
|
+
self,
|
705
|
+
fe_termination_reason: FunctionExecutorTerminationReason,
|
706
|
+
allocation_ids_caused_termination: List[str],
|
665
707
|
) -> None:
|
666
|
-
"""
|
708
|
+
"""Starts termination of the Function Executor if it's not started yet.
|
667
709
|
|
668
710
|
Doesn't raise any exceptions. Doesn't block.
|
669
711
|
"""
|
670
|
-
|
712
|
+
if self._internal_state in [
|
713
|
+
_FE_CONTROLLER_STATE.TERMINATING,
|
714
|
+
_FE_CONTROLLER_STATE.TERMINATED,
|
715
|
+
]:
|
716
|
+
# _start_termination() can be called multiple times, e.g. by each failed task alloc
|
717
|
+
# when the FE is unhealthy. Dedup the calls to keep state machine consistent.
|
718
|
+
return
|
719
|
+
|
720
|
+
self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATING)
|
721
|
+
next_aio = terminate_function_executor(
|
671
722
|
function_executor=self._fe,
|
672
|
-
|
723
|
+
lock=self._destroy_lock,
|
724
|
+
fe_termination_reason=fe_termination_reason,
|
725
|
+
allocation_ids_caused_termination=allocation_ids_caused_termination,
|
673
726
|
logger=self._logger,
|
674
727
|
)
|
675
|
-
self._fe = None
|
676
728
|
self._spawn_aio_for_fe(
|
677
729
|
aio=next_aio,
|
678
|
-
on_exception=
|
679
|
-
is_success=False,
|
730
|
+
on_exception=FunctionExecutorTerminated(
|
731
|
+
is_success=False,
|
732
|
+
fe_termination_reason=fe_termination_reason,
|
733
|
+
allocation_ids_caused_termination=allocation_ids_caused_termination,
|
680
734
|
),
|
681
735
|
)
|
682
736
|
|
@@ -716,16 +770,18 @@ class FunctionExecutorController:
|
|
716
770
|
# BaseException includes asyncio.CancelledError which is always raised here.
|
717
771
|
pass
|
718
772
|
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
773
|
+
# Makes sure we don't run fe destroy concurrently with an event loop task.
|
774
|
+
# FE destroy uses asyncio.to_thread() calls so it doesn't get cancelled with all the tasks above.
|
775
|
+
async with self._destroy_lock:
|
776
|
+
if self._fe is not None:
|
777
|
+
self._logger.info(
|
778
|
+
"destroying function executor",
|
725
779
|
)
|
726
|
-
|
727
|
-
|
728
|
-
|
780
|
+
await self._fe.destroy()
|
781
|
+
|
782
|
+
# Cleanup the metric from this FE.
|
783
|
+
metric_function_executors_with_state.labels(
|
784
|
+
state=_to_fe_state_metric_label(self._internal_state, self._logger)
|
729
785
|
).dec()
|
730
786
|
|
731
787
|
self._state_reporter.remove_function_executor_state(self.function_executor_id())
|
@@ -735,21 +791,23 @@ class FunctionExecutorController:
|
|
735
791
|
debug_print_events(events=self._events, logger=self._logger)
|
736
792
|
|
737
793
|
|
738
|
-
def
|
739
|
-
if
|
740
|
-
return
|
741
|
-
elif
|
742
|
-
return
|
743
|
-
elif
|
744
|
-
return
|
745
|
-
elif
|
746
|
-
return
|
794
|
+
def _to_fe_state_metric_label(state: _FE_CONTROLLER_STATE, logger: Any) -> str:
|
795
|
+
if state == _FE_CONTROLLER_STATE.NOT_STARTED:
|
796
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
|
797
|
+
elif state == _FE_CONTROLLER_STATE.STARTING_UP:
|
798
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
|
799
|
+
elif state == _FE_CONTROLLER_STATE.RUNNING:
|
800
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
|
801
|
+
elif state == _FE_CONTROLLER_STATE.TERMINATING:
|
802
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
|
803
|
+
elif state == _FE_CONTROLLER_STATE.TERMINATED:
|
804
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
|
747
805
|
else:
|
748
806
|
logger.error(
|
749
|
-
"unexpected Function Executor
|
750
|
-
|
807
|
+
"unexpected Function Executor internal state",
|
808
|
+
state=state.name,
|
751
809
|
)
|
752
|
-
return
|
810
|
+
return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
|
753
811
|
|
754
812
|
|
755
813
|
_termination_reason_to_short_name_map = {
|
@@ -757,8 +815,6 @@ _termination_reason_to_short_name_map = {
|
|
757
815
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
|
758
816
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
|
759
817
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
|
760
|
-
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
|
761
|
-
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
|
762
818
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
|
763
819
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
|
764
820
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
|
@@ -34,27 +34,34 @@ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
|
|
34
34
|
)
|
35
35
|
)
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
["status"],
|
42
|
-
)
|
37
|
+
metric_function_executors_with_state: prometheus_client.Gauge = prometheus_client.Gauge(
|
38
|
+
"function_executors_with_state",
|
39
|
+
"Number of Function Executors with a particular internal state",
|
40
|
+
["state"],
|
43
41
|
)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN = "unknown"
|
43
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED = "not_started"
|
44
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP = "starting_up"
|
45
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING = "running"
|
46
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING = "terminating"
|
47
|
+
METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED = "terminated"
|
48
|
+
|
48
49
|
|
49
|
-
|
50
|
-
|
50
|
+
metric_function_executors_with_state.labels(
|
51
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
|
52
|
+
)
|
53
|
+
metric_function_executors_with_state.labels(
|
54
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
|
55
|
+
)
|
56
|
+
metric_function_executors_with_state.labels(
|
57
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
|
51
58
|
)
|
52
|
-
|
53
|
-
|
59
|
+
metric_function_executors_with_state.labels(
|
60
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
|
54
61
|
)
|
55
|
-
|
56
|
-
|
62
|
+
metric_function_executors_with_state.labels(
|
63
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
|
57
64
|
)
|
58
|
-
|
59
|
-
|
65
|
+
metric_function_executors_with_state.labels(
|
66
|
+
state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
|
60
67
|
)
|