indexify 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +20 -91
- indexify/executor/api_objects.py +2 -0
- indexify/executor/executor.py +75 -84
- indexify/executor/function_executor/function_executor.py +5 -2
- indexify/executor/function_executor/function_executor_state.py +43 -43
- indexify/executor/function_executor/function_executor_states_container.py +10 -4
- indexify/executor/function_executor/function_executor_status.py +91 -0
- indexify/executor/function_executor/health_checker.py +37 -13
- indexify/executor/function_executor/metrics/function_executor.py +1 -1
- indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
- indexify/executor/function_executor/single_task_runner.py +100 -37
- indexify/executor/grpc/channel_creator.py +53 -0
- indexify/executor/grpc/metrics/channel_creator.py +18 -0
- indexify/executor/grpc/metrics/state_reporter.py +17 -0
- indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
- indexify/executor/grpc/state_reporter.py +199 -0
- indexify/executor/metrics/task_runner.py +7 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
- indexify/executor/task_runner.py +34 -6
- indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
- indexify/proto/task_scheduler_pb2.py +64 -0
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/METADATA +1 -1
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/RECORD +28 -24
- indexify/executor/state_reporter.py +0 -127
- indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/WHEEL +0 -0
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/entry_points.txt +0 -0
@@ -7,29 +7,29 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
SerializedObject,
|
8
8
|
)
|
9
9
|
|
10
|
-
from indexify.
|
10
|
+
from indexify.proto.task_scheduler_pb2 import (
|
11
11
|
DesiredExecutorState,
|
12
12
|
FunctionExecutorDescription,
|
13
13
|
FunctionExecutorStatus,
|
14
14
|
GetDesiredExecutorStatesRequest,
|
15
15
|
)
|
16
|
-
from indexify.
|
16
|
+
from indexify.proto.task_scheduler_pb2_grpc import (
|
17
17
|
TaskSchedulerServiceStub,
|
18
18
|
)
|
19
19
|
|
20
|
-
from
|
21
|
-
from
|
22
|
-
from
|
23
|
-
from
|
20
|
+
from ..downloader import Downloader
|
21
|
+
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
22
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
23
|
+
from ..function_executor.function_executor_states_container import (
|
24
24
|
FunctionExecutorStatesContainer,
|
25
25
|
)
|
26
|
-
from
|
26
|
+
from ..function_executor.server.function_executor_server_factory import (
|
27
27
|
FunctionExecutorServerConfiguration,
|
28
28
|
FunctionExecutorServerFactory,
|
29
29
|
)
|
30
|
-
from
|
31
|
-
from
|
32
|
-
from
|
30
|
+
from ..function_executor.task_input import TaskInput
|
31
|
+
from ..function_executor.task_output import TaskOutput
|
32
|
+
from ..metrics.executor import (
|
33
33
|
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
34
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
35
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
@@ -42,7 +42,10 @@ from .metrics.executor import (
|
|
42
42
|
metric_tasks_fetched,
|
43
43
|
metric_tasks_reporting_outcome,
|
44
44
|
)
|
45
|
-
from
|
45
|
+
from ..task_reporter import TaskReporter
|
46
|
+
from .channel_creator import ChannelCreator
|
47
|
+
|
48
|
+
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
46
49
|
|
47
50
|
|
48
51
|
class ExecutorStateReconciler:
|
@@ -55,11 +58,13 @@ class ExecutorStateReconciler:
|
|
55
58
|
config_path: Optional[str],
|
56
59
|
downloader: Downloader,
|
57
60
|
task_reporter: TaskReporter,
|
58
|
-
|
61
|
+
channel_creator: ChannelCreator,
|
59
62
|
logger: Any,
|
60
63
|
):
|
61
64
|
self._executor_id: str = executor_id
|
62
|
-
self.
|
65
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
66
|
+
function_executor_server_factory
|
67
|
+
)
|
63
68
|
self._base_url: str = base_url
|
64
69
|
self._config_path: Optional[str] = config_path
|
65
70
|
self._downloader: Downloader = downloader
|
@@ -67,39 +72,60 @@ class ExecutorStateReconciler:
|
|
67
72
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
68
73
|
function_executor_states
|
69
74
|
)
|
70
|
-
self.
|
75
|
+
self._channel_creator = channel_creator
|
71
76
|
self._logger: Any = logger.bind(module=__name__)
|
72
77
|
self._is_shutdown: bool = False
|
73
|
-
self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
|
74
78
|
self._server_last_clock: Optional[int] = None
|
75
79
|
|
76
80
|
async def run(self):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
81
|
+
"""Runs the state reconciler.
|
82
|
+
|
83
|
+
Never raises any exceptions.
|
84
|
+
"""
|
85
|
+
while not self._is_shutdown:
|
86
|
+
async with await self._channel_creator.create() as server_channel:
|
87
|
+
server_channel: grpc.aio.Channel
|
88
|
+
stub = TaskSchedulerServiceStub(server_channel)
|
89
|
+
while not self._is_shutdown:
|
90
|
+
try:
|
91
|
+
# TODO: Report state once before starting the stream.
|
92
|
+
desired_states_stream: AsyncGenerator[
|
93
|
+
DesiredExecutorState, None
|
94
|
+
] = stub.get_desired_executor_states(
|
95
|
+
GetDesiredExecutorStatesRequest(
|
96
|
+
executor_id=self._executor_id
|
97
|
+
)
|
98
|
+
)
|
99
|
+
await self._process_desired_states_stream(desired_states_stream)
|
100
|
+
except Exception as e:
|
101
|
+
self._logger.error(
|
102
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
103
|
+
exc_info=e,
|
104
|
+
)
|
105
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
106
|
+
break
|
107
|
+
|
108
|
+
self._logger.info("State reconciler shutdown.")
|
109
|
+
|
110
|
+
async def _process_desired_states_stream(
|
111
|
+
self, desired_states: AsyncGenerator[DesiredExecutorState, None]
|
112
|
+
):
|
82
113
|
async for new_state in desired_states:
|
83
114
|
if self._is_shutdown:
|
84
115
|
return
|
116
|
+
|
85
117
|
new_state: DesiredExecutorState
|
86
118
|
if self._server_last_clock is not None:
|
87
119
|
if self._server_last_clock >= new_state.clock:
|
88
120
|
continue # Duplicate or outdated message state sent by Server.
|
89
121
|
|
90
122
|
self._server_last_clock = new_state.clock
|
91
|
-
|
123
|
+
await self._reconcile_state(new_state)
|
92
124
|
|
93
125
|
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
# Simple non concurrent implementation for now for the PoC.
|
98
|
-
# Obtain this lock to force only a single coroutine doing the reconciliation.
|
99
|
-
async with self._reconciliation_lock:
|
100
|
-
await self._reconcile_function_executors(new_state)
|
101
|
-
# TODO
|
102
|
-
# await self._reconcile_task_allocations(new_state)
|
126
|
+
await self._reconcile_function_executors(new_state)
|
127
|
+
# TODO
|
128
|
+
# await self._reconcile_task_allocations(new_state)
|
103
129
|
|
104
130
|
async def shutdown(self):
|
105
131
|
"""Shuts down the state reconciler.
|
@@ -121,6 +147,7 @@ class ExecutorStateReconciler:
|
|
121
147
|
graph_name=desired_function_executor.graph_name,
|
122
148
|
graph_version=desired_function_executor.graph_version,
|
123
149
|
function_name=desired_function_executor.function_name,
|
150
|
+
image_uri=desired_function_executor.image_uri,
|
124
151
|
)
|
125
152
|
)
|
126
153
|
|
@@ -203,13 +230,15 @@ class ExecutorStateReconciler:
|
|
203
230
|
logger=logger,
|
204
231
|
)
|
205
232
|
function_executor: FunctionExecutor = FunctionExecutor(
|
206
|
-
server_factory=self.
|
233
|
+
server_factory=self._function_executor_server_factory, logger=logger
|
207
234
|
)
|
208
235
|
config: FunctionExecutorServerConfiguration = (
|
209
236
|
FunctionExecutorServerConfiguration(
|
210
237
|
executor_id=self._executor_id,
|
211
238
|
function_executor_id=description.id,
|
239
|
+
namespace=description.namespace,
|
212
240
|
image_uri=description.image_uri,
|
241
|
+
secret_names=list(description.secret_names),
|
213
242
|
)
|
214
243
|
)
|
215
244
|
initialize_request: InitializeRequest = InitializeRequest(
|
@@ -0,0 +1,199 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, Dict, List, Optional
|
3
|
+
|
4
|
+
import grpc
|
5
|
+
|
6
|
+
from indexify.proto.task_scheduler_pb2 import (
|
7
|
+
AllowedFunction,
|
8
|
+
ExecutorState,
|
9
|
+
ExecutorStatus,
|
10
|
+
FunctionExecutorDescription,
|
11
|
+
)
|
12
|
+
from indexify.proto.task_scheduler_pb2 import (
|
13
|
+
FunctionExecutorState as FunctionExecutorStateProto,
|
14
|
+
)
|
15
|
+
from indexify.proto.task_scheduler_pb2 import (
|
16
|
+
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
17
|
+
)
|
18
|
+
from indexify.proto.task_scheduler_pb2 import (
|
19
|
+
GPUModel,
|
20
|
+
GPUResources,
|
21
|
+
HostResources,
|
22
|
+
ReportExecutorStateRequest,
|
23
|
+
)
|
24
|
+
from indexify.proto.task_scheduler_pb2_grpc import (
|
25
|
+
TaskSchedulerServiceStub,
|
26
|
+
)
|
27
|
+
|
28
|
+
from ..api_objects import FunctionURI
|
29
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
30
|
+
from ..function_executor.function_executor_states_container import (
|
31
|
+
FunctionExecutorStatesContainer,
|
32
|
+
)
|
33
|
+
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
34
|
+
from .channel_creator import ChannelCreator
|
35
|
+
from .metrics.state_reporter import (
|
36
|
+
metric_state_report_errors,
|
37
|
+
metric_state_report_latency,
|
38
|
+
metric_state_report_rpcs,
|
39
|
+
)
|
40
|
+
|
41
|
+
_REPORTING_INTERVAL_SEC = 5
|
42
|
+
_REPORT_RPC_TIMEOUT_SEC = 5
|
43
|
+
_REPORT_BACKOFF_ON_ERROR_SEC = 5
|
44
|
+
|
45
|
+
|
46
|
+
class ExecutorStateReporter:
|
47
|
+
def __init__(
|
48
|
+
self,
|
49
|
+
executor_id: str,
|
50
|
+
development_mode: bool,
|
51
|
+
function_allowlist: Optional[List[FunctionURI]],
|
52
|
+
function_executor_states: FunctionExecutorStatesContainer,
|
53
|
+
channel_creator: ChannelCreator,
|
54
|
+
logger: Any,
|
55
|
+
):
|
56
|
+
self._executor_id: str = executor_id
|
57
|
+
self._development_mode: bool = development_mode
|
58
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
59
|
+
function_executor_states
|
60
|
+
)
|
61
|
+
self._channel_creator = channel_creator
|
62
|
+
self._logger: Any = logger.bind(module=__name__)
|
63
|
+
self._is_shutdown: bool = False
|
64
|
+
self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
|
65
|
+
self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
|
66
|
+
function_allowlist
|
67
|
+
)
|
68
|
+
|
69
|
+
def update_executor_status(self, value: ExecutorStatus):
|
70
|
+
self._executor_status = value
|
71
|
+
|
72
|
+
async def run(self):
|
73
|
+
"""Runs the state reporter.
|
74
|
+
|
75
|
+
Never raises any exceptions.
|
76
|
+
"""
|
77
|
+
while not self._is_shutdown:
|
78
|
+
async with await self._channel_creator.create() as server_channel:
|
79
|
+
server_channel: grpc.aio.Channel
|
80
|
+
stub = TaskSchedulerServiceStub(server_channel)
|
81
|
+
while not self._is_shutdown:
|
82
|
+
try:
|
83
|
+
await self._report_state(stub)
|
84
|
+
await asyncio.sleep(_REPORTING_INTERVAL_SEC)
|
85
|
+
except Exception as e:
|
86
|
+
self._logger.error(
|
87
|
+
f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
|
88
|
+
exc_info=e,
|
89
|
+
)
|
90
|
+
await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
|
91
|
+
break
|
92
|
+
|
93
|
+
self._logger.info("State reporter shutdown")
|
94
|
+
|
95
|
+
async def _report_state(self, stub: TaskSchedulerServiceStub):
|
96
|
+
with (
|
97
|
+
metric_state_report_errors.count_exceptions(),
|
98
|
+
metric_state_report_latency.time(),
|
99
|
+
):
|
100
|
+
metric_state_report_rpcs.inc()
|
101
|
+
state = ExecutorState(
|
102
|
+
executor_id=self._executor_id,
|
103
|
+
development_mode=self._development_mode,
|
104
|
+
executor_status=self._executor_status,
|
105
|
+
free_resources=await self._fetch_free_host_resources(),
|
106
|
+
allowed_functions=self._allowed_functions,
|
107
|
+
function_executor_states=await self._fetch_function_executor_states(),
|
108
|
+
)
|
109
|
+
|
110
|
+
await stub.report_executor_state(
|
111
|
+
ReportExecutorStateRequest(executor_state=state),
|
112
|
+
timeout=_REPORT_RPC_TIMEOUT_SEC,
|
113
|
+
)
|
114
|
+
|
115
|
+
async def _fetch_free_host_resources(self) -> HostResources:
|
116
|
+
# TODO: Implement host resource metrics reporting.
|
117
|
+
return HostResources(
|
118
|
+
cpu_count=0,
|
119
|
+
memory_bytes=0,
|
120
|
+
disk_bytes=0,
|
121
|
+
gpu=GPUResources(
|
122
|
+
count=0,
|
123
|
+
model=GPUModel.GPU_MODEL_UNKNOWN,
|
124
|
+
),
|
125
|
+
)
|
126
|
+
|
127
|
+
async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
|
128
|
+
states = []
|
129
|
+
|
130
|
+
async for function_executor_state in self._function_executor_states:
|
131
|
+
function_executor_state: FunctionExecutorState
|
132
|
+
states.append(
|
133
|
+
FunctionExecutorStateProto(
|
134
|
+
description=FunctionExecutorDescription(
|
135
|
+
id=function_executor_state.id,
|
136
|
+
namespace=function_executor_state.namespace,
|
137
|
+
graph_name=function_executor_state.graph_name,
|
138
|
+
graph_version=function_executor_state.graph_version,
|
139
|
+
function_name=function_executor_state.function_name,
|
140
|
+
),
|
141
|
+
status=_to_grpc_function_executor_status(
|
142
|
+
function_executor_state.status, self._logger
|
143
|
+
),
|
144
|
+
)
|
145
|
+
)
|
146
|
+
|
147
|
+
return states
|
148
|
+
|
149
|
+
async def shutdown(self):
|
150
|
+
"""Shuts down the state reporter.
|
151
|
+
|
152
|
+
Never raises any exceptions.
|
153
|
+
"""
|
154
|
+
self._is_shutdown = True
|
155
|
+
|
156
|
+
|
157
|
+
def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
|
158
|
+
if function_allowlist is None:
|
159
|
+
return []
|
160
|
+
|
161
|
+
allowed_functions: List[AllowedFunction] = []
|
162
|
+
for function_uri in function_allowlist:
|
163
|
+
function_uri: FunctionURI
|
164
|
+
allowed_function = AllowedFunction(
|
165
|
+
namespace=function_uri.namespace,
|
166
|
+
graph_name=function_uri.compute_graph,
|
167
|
+
function_name=function_uri.compute_fn,
|
168
|
+
)
|
169
|
+
if function_uri.version is not None:
|
170
|
+
allowed_function.graph_version = function_uri.version
|
171
|
+
allowed_functions.append(allowed_function)
|
172
|
+
|
173
|
+
return allowed_functions
|
174
|
+
|
175
|
+
|
176
|
+
_STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
|
177
|
+
FunctionExecutorStatus.STARTING_UP: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
|
178
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR,
|
179
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR,
|
180
|
+
FunctionExecutorStatus.IDLE: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
|
181
|
+
FunctionExecutorStatus.RUNNING_TASK: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
|
182
|
+
FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
|
183
|
+
FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
|
184
|
+
FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
|
185
|
+
FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
|
186
|
+
}
|
187
|
+
|
188
|
+
|
189
|
+
def _to_grpc_function_executor_status(
|
190
|
+
status: FunctionExecutorStatus, logger: Any
|
191
|
+
) -> FunctionExecutorStatusProto:
|
192
|
+
result: FunctionExecutorStatusProto = _STATUS_MAPPING.get(
|
193
|
+
status, FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN
|
194
|
+
)
|
195
|
+
|
196
|
+
if result == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
|
197
|
+
logger.error("Unexpected Function Executor status", status=status)
|
198
|
+
|
199
|
+
return result
|
@@ -23,6 +23,13 @@ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gaug
|
|
23
23
|
"tasks_blocked_by_policy",
|
24
24
|
"Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
|
25
25
|
)
|
26
|
+
metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
|
27
|
+
prometheus_client.Gauge(
|
28
|
+
"tasks_blocked_by_policy_per_function_name",
|
29
|
+
"Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
|
30
|
+
["function_name"],
|
31
|
+
)
|
32
|
+
)
|
26
33
|
|
27
34
|
# Metrics for the stage when task is running.
|
28
35
|
metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
|
@@ -3,6 +3,7 @@ from typing import Optional
|
|
3
3
|
from ...function_executor.function_executor_states_container import (
|
4
4
|
FunctionExecutorStatesContainer,
|
5
5
|
)
|
6
|
+
from ...function_executor.function_executor_status import FunctionExecutorStatus
|
6
7
|
from .health_checker import HealthChecker, HealthCheckResult
|
7
8
|
|
8
9
|
HEALTH_CHECKER_NAME = "GenericHealthChecker"
|
@@ -16,6 +17,7 @@ class GenericHealthChecker(HealthChecker):
|
|
16
17
|
|
17
18
|
def __init__(self):
|
18
19
|
self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
|
20
|
+
self._function_executor_health_check_ever_failed = False
|
19
21
|
|
20
22
|
def set_function_executor_states_container(
|
21
23
|
self, states: FunctionExecutorStatesContainer
|
@@ -42,17 +44,30 @@ class GenericHealthChecker(HealthChecker):
|
|
42
44
|
# * So we fail whole Executor health check if a Function Executor health check ever failed to hint the users
|
43
45
|
# that we probably need to recreate the Executor machine/VM/container (unless there's a bug in Function
|
44
46
|
# code that the user can investigate themselves).
|
47
|
+
await self._check_function_executors()
|
48
|
+
if self._function_executor_health_check_ever_failed:
|
49
|
+
return HealthCheckResult(
|
50
|
+
is_success=False,
|
51
|
+
status_message="A Function Executor health check failed",
|
52
|
+
checker_name=HEALTH_CHECKER_NAME,
|
53
|
+
)
|
54
|
+
else:
|
55
|
+
return HealthCheckResult(
|
56
|
+
is_success=True,
|
57
|
+
status_message="All Function Executors pass health checks",
|
58
|
+
checker_name=HEALTH_CHECKER_NAME,
|
59
|
+
)
|
60
|
+
|
61
|
+
async def _check_function_executors(self):
|
62
|
+
if self._function_executor_health_check_ever_failed:
|
63
|
+
return
|
64
|
+
|
45
65
|
async for state in self._function_executor_states:
|
46
66
|
# No need to async lock the state to read a single value.
|
47
|
-
if state.
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
return HealthCheckResult(
|
55
|
-
is_success=True,
|
56
|
-
status_message="All Function Executors pass health checks",
|
57
|
-
checker_name=HEALTH_CHECKER_NAME,
|
58
|
-
)
|
67
|
+
if state.status in [
|
68
|
+
FunctionExecutorStatus.UNHEALTHY,
|
69
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
|
70
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
|
71
|
+
]:
|
72
|
+
self._function_executor_health_check_ever_failed = True
|
73
|
+
return
|
indexify/executor/task_runner.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
from typing import Any, Optional
|
2
2
|
|
3
3
|
from .api_objects import Task
|
4
|
-
from .function_executor.function_executor_state import
|
4
|
+
from .function_executor.function_executor_state import (
|
5
|
+
FunctionExecutorState,
|
6
|
+
FunctionExecutorStatus,
|
7
|
+
)
|
5
8
|
from .function_executor.function_executor_states_container import (
|
6
9
|
FunctionExecutorStatesContainer,
|
7
10
|
)
|
@@ -19,6 +22,7 @@ from .metrics.task_runner import (
|
|
19
22
|
metric_task_run_platform_errors,
|
20
23
|
metric_task_runs,
|
21
24
|
metric_tasks_blocked_by_policy,
|
25
|
+
metric_tasks_blocked_by_policy_per_function_name,
|
22
26
|
metric_tasks_running,
|
23
27
|
)
|
24
28
|
|
@@ -52,6 +56,9 @@ class TaskRunner:
|
|
52
56
|
with (
|
53
57
|
metric_task_policy_errors.count_exceptions(),
|
54
58
|
metric_tasks_blocked_by_policy.track_inprogress(),
|
59
|
+
metric_tasks_blocked_by_policy_per_function_name.labels(
|
60
|
+
function_name=task_input.task.compute_fn
|
61
|
+
).track_inprogress(),
|
55
62
|
metric_task_policy_latency.time(),
|
56
63
|
):
|
57
64
|
metric_task_policy_runs.inc()
|
@@ -109,6 +116,10 @@ class TaskRunner:
|
|
109
116
|
raise
|
110
117
|
|
111
118
|
async def _run_task_policy(self, state: FunctionExecutorState, task: Task) -> None:
|
119
|
+
"""Runs the task policy until the task can run on the Function Executor.
|
120
|
+
|
121
|
+
On successful return the Function Executor status is either IDLE or DESTROYED.
|
122
|
+
"""
|
112
123
|
# Current policy for running tasks:
|
113
124
|
# - There can only be a single Function Executor per function regardless of function versions.
|
114
125
|
# -- If a Function Executor already exists for a different function version then wait until
|
@@ -116,13 +127,30 @@ class TaskRunner:
|
|
116
127
|
# -- This prevents failed tasks for different versions of the same function continuously
|
117
128
|
# destroying each other's Function Executors.
|
118
129
|
# - Each Function Executor runs at most 1 task concurrently.
|
119
|
-
await state.
|
130
|
+
await state.wait_status(
|
131
|
+
[
|
132
|
+
FunctionExecutorStatus.DESTROYED,
|
133
|
+
FunctionExecutorStatus.IDLE,
|
134
|
+
FunctionExecutorStatus.UNHEALTHY,
|
135
|
+
FunctionExecutorStatus.SHUTDOWN,
|
136
|
+
]
|
137
|
+
)
|
138
|
+
# We only shut down the Function Executor on full Executor shutdown so it's fine to raise an error here.
|
139
|
+
if state.status == FunctionExecutorStatus.SHUTDOWN:
|
140
|
+
raise Exception("Function Executor state is shutting down")
|
120
141
|
|
121
|
-
if state.
|
142
|
+
if state.status == FunctionExecutorStatus.UNHEALTHY:
|
122
143
|
await state.destroy_function_executor()
|
123
|
-
|
124
|
-
|
125
|
-
#
|
144
|
+
|
145
|
+
if state.graph_version == task.graph_version:
|
146
|
+
return # All good, we can run on this Function Executor.
|
147
|
+
|
148
|
+
if state.status in [FunctionExecutorStatus.IDLE]:
|
149
|
+
await state.destroy_function_executor()
|
150
|
+
|
151
|
+
state.graph_version = task.graph_version
|
152
|
+
# At this point the state belongs to the version of the function from the task
|
153
|
+
# and there are no running tasks in the Function Executor.
|
126
154
|
|
127
155
|
async def _run_task(
|
128
156
|
self, state: FunctionExecutorState, task_input: TaskInput, logger: Any
|
@@ -26,7 +26,7 @@ message GPUResources {
|
|
26
26
|
optional GPUModel model = 2;
|
27
27
|
}
|
28
28
|
|
29
|
-
//
|
29
|
+
// Resources that we're currently tracking and limiting on Executor.
|
30
30
|
message HostResources {
|
31
31
|
optional uint32 cpu_count = 1;
|
32
32
|
optional uint64 memory_bytes = 2;
|
@@ -45,14 +45,14 @@ message AllowedFunction {
|
|
45
45
|
|
46
46
|
enum FunctionExecutorStatus {
|
47
47
|
FUNCTION_EXECUTOR_STATUS_UNKNOWN = 0;
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
48
|
+
FUNCTION_EXECUTOR_STATUS_STARTING_UP = 1;
|
49
|
+
FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR = 2;
|
50
|
+
FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR = 3;
|
51
|
+
FUNCTION_EXECUTOR_STATUS_IDLE = 4;
|
52
|
+
FUNCTION_EXECUTOR_STATUS_RUNNING_TASK = 5;
|
53
|
+
FUNCTION_EXECUTOR_STATUS_UNHEALTHY = 6;
|
54
|
+
FUNCTION_EXECUTOR_STATUS_STOPPING = 7;
|
55
|
+
FUNCTION_EXECUTOR_STATUS_STOPPED = 8;
|
56
56
|
}
|
57
57
|
|
58
58
|
// Immutable information that identifies and describes a Function Executor.
|
@@ -63,6 +63,8 @@ message FunctionExecutorDescription {
|
|
63
63
|
optional string graph_version = 4;
|
64
64
|
optional string function_name = 5;
|
65
65
|
optional string image_uri = 6;
|
66
|
+
repeated string secret_names = 7;
|
67
|
+
optional HostResources resource_limits = 8;
|
66
68
|
}
|
67
69
|
|
68
70
|
message FunctionExecutorState {
|
@@ -72,19 +74,22 @@ message FunctionExecutorState {
|
|
72
74
|
|
73
75
|
enum ExecutorStatus {
|
74
76
|
EXECUTOR_STATUS_UNKNOWN = 0;
|
75
|
-
|
77
|
+
EXECUTOR_STATUS_STARTING_UP = 1;
|
76
78
|
EXECUTOR_STATUS_RUNNING = 2;
|
77
79
|
EXECUTOR_STATUS_DRAINED = 3;
|
78
|
-
|
80
|
+
EXECUTOR_STATUS_STOPPING = 4;
|
81
|
+
EXECUTOR_STATUS_STOPPED = 5;
|
79
82
|
}
|
80
83
|
|
81
84
|
message ExecutorState {
|
82
85
|
optional string executor_id = 1;
|
83
|
-
optional
|
84
|
-
optional
|
86
|
+
optional bool development_mode = 2;
|
87
|
+
optional ExecutorStatus executor_status = 3;
|
88
|
+
// Free resources available at the Executor.
|
89
|
+
optional HostResources free_resources = 4;
|
85
90
|
// Empty allowed_functions list means that any function can run on the Executor.
|
86
|
-
repeated AllowedFunction allowed_functions =
|
87
|
-
repeated FunctionExecutorState function_executor_states =
|
91
|
+
repeated AllowedFunction allowed_functions = 5;
|
92
|
+
repeated FunctionExecutorState function_executor_states = 6;
|
88
93
|
}
|
89
94
|
|
90
95
|
// A message sent by Executor to report its up to date state to Server.
|
@@ -106,6 +111,7 @@ message Task {
|
|
106
111
|
optional string graph_invocation_id = 6;
|
107
112
|
optional string input_key = 8;
|
108
113
|
optional string reducer_output_key = 9;
|
114
|
+
optional string timeout_ms = 10;
|
109
115
|
}
|
110
116
|
|
111
117
|
message TaskAllocation {
|
@@ -139,9 +145,9 @@ service TaskSchedulerService {
|
|
139
145
|
// Called by Executor to open a stream of its desired states. When Server wants Executor to change something
|
140
146
|
// it puts a message on the stream with the new desired state of the Executor.
|
141
147
|
//
|
142
|
-
//
|
148
|
+
// Deprecated HTTP API is used to download the serialized graph and task inputs.
|
143
149
|
rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
|
144
150
|
|
145
|
-
// Task outcome is currently reported via
|
151
|
+
// Task outcome is currently reported via deprecated HTTP API. We're going to migrate task output reporting to gRPC
|
146
152
|
// when we move S3 downloads and uploads to Executor.
|
147
153
|
}
|