indexify 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +21 -18
- indexify/executor/api_objects.py +12 -0
- indexify/executor/downloader.py +4 -1
- indexify/executor/executor.py +65 -28
- indexify/executor/executor_flavor.py +7 -0
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +3 -1
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +20 -2
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +35 -2
- indexify/executor/grpc/channel_manager.py +160 -0
- indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify/executor/grpc/function_executor_controller.py +421 -0
- indexify/executor/grpc/state_reconciler.py +33 -38
- indexify/executor/grpc/state_reporter.py +100 -39
- indexify/executor/grpc/task_controller.py +449 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_fetcher.py +8 -3
- indexify/executor/task_reporter.py +112 -4
- indexify/executor/task_runner.py +1 -0
- indexify/proto/{task_scheduler.proto → executor_api.proto} +86 -11
- indexify/proto/executor_api_pb2.py +80 -0
- indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +162 -7
- indexify/proto/executor_api_pb2_grpc.py +227 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/RECORD +32 -28
- indexify/executor/grpc/channel_creator.py +0 -53
- indexify/proto/task_scheduler_pb2.py +0 -64
- indexify/proto/task_scheduler_pb2_grpc.py +0 -170
- /indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
SerializedObject,
|
8
8
|
)
|
9
9
|
|
10
|
-
from indexify.proto.
|
10
|
+
from indexify.proto.executor_api_pb2 import (
|
11
11
|
DesiredExecutorState,
|
12
12
|
FunctionExecutorDescription,
|
13
13
|
FunctionExecutorStatus,
|
14
14
|
GetDesiredExecutorStatesRequest,
|
15
15
|
)
|
16
|
-
from indexify.proto.
|
17
|
-
|
16
|
+
from indexify.proto.executor_api_pb2_grpc import (
|
17
|
+
ExecutorAPIStub,
|
18
18
|
)
|
19
19
|
|
20
20
|
from ..downloader import Downloader
|
@@ -30,20 +30,11 @@ from ..function_executor.server.function_executor_server_factory import (
|
|
30
30
|
from ..function_executor.task_input import TaskInput
|
31
31
|
from ..function_executor.task_output import TaskOutput
|
32
32
|
from ..metrics.executor import (
|
33
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
36
|
-
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
37
|
-
metric_task_completion_latency,
|
38
|
-
metric_task_outcome_report_latency,
|
39
|
-
metric_task_outcome_report_retries,
|
40
|
-
metric_task_outcome_reports,
|
41
|
-
metric_tasks_completed,
|
42
33
|
metric_tasks_fetched,
|
43
|
-
metric_tasks_reporting_outcome,
|
44
34
|
)
|
45
35
|
from ..task_reporter import TaskReporter
|
46
|
-
from .
|
36
|
+
from .channel_manager import ChannelManager
|
37
|
+
from .state_reporter import ExecutorStateReporter
|
47
38
|
|
48
39
|
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
49
40
|
|
@@ -58,7 +49,8 @@ class ExecutorStateReconciler:
|
|
58
49
|
config_path: Optional[str],
|
59
50
|
downloader: Downloader,
|
60
51
|
task_reporter: TaskReporter,
|
61
|
-
|
52
|
+
channel_manager: ChannelManager,
|
53
|
+
state_reporter: ExecutorStateReporter,
|
62
54
|
logger: Any,
|
63
55
|
):
|
64
56
|
self._executor_id: str = executor_id
|
@@ -72,7 +64,8 @@ class ExecutorStateReconciler:
|
|
72
64
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
73
65
|
function_executor_states
|
74
66
|
)
|
75
|
-
self.
|
67
|
+
self._channel_manager: ChannelManager = channel_manager
|
68
|
+
self._state_reporter: ExecutorStateReporter = state_reporter
|
76
69
|
self._logger: Any = logger.bind(module=__name__)
|
77
70
|
self._is_shutdown: bool = False
|
78
71
|
self._server_last_clock: Optional[int] = None
|
@@ -83,27 +76,25 @@ class ExecutorStateReconciler:
|
|
83
76
|
Never raises any exceptions.
|
84
77
|
"""
|
85
78
|
while not self._is_shutdown:
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
106
|
-
break
|
79
|
+
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
80
|
+
while not self._is_shutdown:
|
81
|
+
try:
|
82
|
+
# Report state once before starting the stream so Server
|
83
|
+
# doesn't use old state it knew about this Executor in the past.
|
84
|
+
await self._state_reporter.report_state(stub)
|
85
|
+
desired_states_stream: AsyncGenerator[
|
86
|
+
DesiredExecutorState, None
|
87
|
+
] = stub.get_desired_executor_states(
|
88
|
+
GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
|
89
|
+
)
|
90
|
+
await self._process_desired_states_stream(desired_states_stream)
|
91
|
+
except Exception as e:
|
92
|
+
self._logger.error(
|
93
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
94
|
+
exc_info=e,
|
95
|
+
)
|
96
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
97
|
+
break
|
107
98
|
|
108
99
|
self._logger.info("State reconciler shutdown.")
|
109
100
|
|
@@ -123,6 +114,7 @@ class ExecutorStateReconciler:
|
|
123
114
|
await self._reconcile_state(new_state)
|
124
115
|
|
125
116
|
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
117
|
+
# TODO: use completed_tasks_container to ignore tasks that were already completed.
|
126
118
|
await self._reconcile_function_executors(new_state)
|
127
119
|
# TODO
|
128
120
|
# await self._reconcile_task_allocations(new_state)
|
@@ -148,6 +140,7 @@ class ExecutorStateReconciler:
|
|
148
140
|
graph_version=desired_function_executor.graph_version,
|
149
141
|
function_name=desired_function_executor.function_name,
|
150
142
|
image_uri=desired_function_executor.image_uri,
|
143
|
+
secret_names=list(desired_function_executor.secret_names),
|
151
144
|
)
|
152
145
|
)
|
153
146
|
|
@@ -291,7 +284,9 @@ class ExecutorStateReconciler:
|
|
291
284
|
while True:
|
292
285
|
logger = logger.bind(retries=reporting_retries)
|
293
286
|
try:
|
294
|
-
await self._task_reporter.report(
|
287
|
+
await self._task_reporter.report(
|
288
|
+
data_payload=task_output, logger=logger
|
289
|
+
)
|
295
290
|
break
|
296
291
|
except Exception as e:
|
297
292
|
logger.error(
|
@@ -1,37 +1,44 @@
|
|
1
1
|
import asyncio
|
2
|
+
import hashlib
|
3
|
+
from socket import gethostname
|
2
4
|
from typing import Any, Dict, List, Optional
|
3
5
|
|
4
6
|
import grpc
|
5
7
|
|
6
|
-
from indexify.proto.
|
8
|
+
from indexify.proto.executor_api_pb2 import (
|
7
9
|
AllowedFunction,
|
10
|
+
)
|
11
|
+
from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
|
12
|
+
from indexify.proto.executor_api_pb2 import (
|
8
13
|
ExecutorState,
|
9
14
|
ExecutorStatus,
|
10
15
|
FunctionExecutorDescription,
|
11
16
|
)
|
12
|
-
from indexify.proto.
|
17
|
+
from indexify.proto.executor_api_pb2 import (
|
13
18
|
FunctionExecutorState as FunctionExecutorStateProto,
|
14
19
|
)
|
15
|
-
from indexify.proto.
|
20
|
+
from indexify.proto.executor_api_pb2 import (
|
16
21
|
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
17
22
|
)
|
18
|
-
from indexify.proto.
|
23
|
+
from indexify.proto.executor_api_pb2 import (
|
19
24
|
GPUModel,
|
20
25
|
GPUResources,
|
21
26
|
HostResources,
|
22
27
|
ReportExecutorStateRequest,
|
23
28
|
)
|
24
|
-
from indexify.proto.
|
25
|
-
|
29
|
+
from indexify.proto.executor_api_pb2_grpc import (
|
30
|
+
ExecutorAPIStub,
|
26
31
|
)
|
27
32
|
|
28
33
|
from ..api_objects import FunctionURI
|
34
|
+
from ..executor_flavor import ExecutorFlavor
|
29
35
|
from ..function_executor.function_executor_state import FunctionExecutorState
|
30
36
|
from ..function_executor.function_executor_states_container import (
|
31
37
|
FunctionExecutorStatesContainer,
|
32
38
|
)
|
33
39
|
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
34
|
-
from
|
40
|
+
from ..runtime_probes import RuntimeProbes
|
41
|
+
from .channel_manager import ChannelManager
|
35
42
|
from .metrics.state_reporter import (
|
36
43
|
metric_state_report_errors,
|
37
44
|
metric_state_report_latency,
|
@@ -47,24 +54,32 @@ class ExecutorStateReporter:
|
|
47
54
|
def __init__(
|
48
55
|
self,
|
49
56
|
executor_id: str,
|
57
|
+
flavor: ExecutorFlavor,
|
58
|
+
version: str,
|
59
|
+
labels: Dict[str, str],
|
50
60
|
development_mode: bool,
|
51
61
|
function_allowlist: Optional[List[FunctionURI]],
|
52
62
|
function_executor_states: FunctionExecutorStatesContainer,
|
53
|
-
|
63
|
+
channel_manager: ChannelManager,
|
54
64
|
logger: Any,
|
55
65
|
):
|
56
66
|
self._executor_id: str = executor_id
|
67
|
+
self._flavor: ExecutorFlavor = flavor
|
68
|
+
self._version: str = version
|
69
|
+
self._labels: Dict[str, str] = labels.copy()
|
57
70
|
self._development_mode: bool = development_mode
|
71
|
+
self._hostname: str = gethostname()
|
58
72
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
59
73
|
function_executor_states
|
60
74
|
)
|
61
|
-
self.
|
75
|
+
self._channel_manager = channel_manager
|
62
76
|
self._logger: Any = logger.bind(module=__name__)
|
63
77
|
self._is_shutdown: bool = False
|
64
78
|
self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
|
65
79
|
self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
|
66
80
|
function_allowlist
|
67
81
|
)
|
82
|
+
self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
|
68
83
|
|
69
84
|
def update_executor_status(self, value: ExecutorStatus):
|
70
85
|
self._executor_status = value
|
@@ -75,24 +90,30 @@ class ExecutorStateReporter:
|
|
75
90
|
Never raises any exceptions.
|
76
91
|
"""
|
77
92
|
while not self._is_shutdown:
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
93
|
+
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
94
|
+
while not self._is_shutdown:
|
95
|
+
try:
|
96
|
+
# The periodic state reports serve as channel health monitoring requests
|
97
|
+
# (same as TCP keep-alive). Channel Manager returns the same healthy channel
|
98
|
+
# for all RPCs that we do from Executor to Server. So all the RPCs benefit
|
99
|
+
# from this channel health monitoring.
|
100
|
+
await self.report_state(stub)
|
101
|
+
await asyncio.sleep(_REPORTING_INTERVAL_SEC)
|
102
|
+
except Exception as e:
|
103
|
+
self._logger.error(
|
104
|
+
f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
|
105
|
+
exc_info=e,
|
106
|
+
)
|
107
|
+
await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
|
108
|
+
break
|
92
109
|
|
93
110
|
self._logger.info("State reporter shutdown")
|
94
111
|
|
95
|
-
async def
|
112
|
+
async def report_state(self, stub: ExecutorAPIStub):
|
113
|
+
"""Reports the current state to the server represented by the supplied stub.
|
114
|
+
|
115
|
+
Raises exceptions on failure.
|
116
|
+
"""
|
96
117
|
with (
|
97
118
|
metric_state_report_errors.count_exceptions(),
|
98
119
|
metric_state_report_latency.time(),
|
@@ -101,11 +122,16 @@ class ExecutorStateReporter:
|
|
101
122
|
state = ExecutorState(
|
102
123
|
executor_id=self._executor_id,
|
103
124
|
development_mode=self._development_mode,
|
104
|
-
|
125
|
+
hostname=self._hostname,
|
126
|
+
flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
|
127
|
+
version=self._version,
|
128
|
+
status=self._executor_status,
|
105
129
|
free_resources=await self._fetch_free_host_resources(),
|
106
130
|
allowed_functions=self._allowed_functions,
|
107
131
|
function_executor_states=await self._fetch_function_executor_states(),
|
132
|
+
labels=self._labels,
|
108
133
|
)
|
134
|
+
state.state_hash = _state_hash(state)
|
109
135
|
|
110
136
|
await stub.report_executor_state(
|
111
137
|
ReportExecutorStateRequest(executor_state=state),
|
@@ -129,20 +155,25 @@ class ExecutorStateReporter:
|
|
129
155
|
|
130
156
|
async for function_executor_state in self._function_executor_states:
|
131
157
|
function_executor_state: FunctionExecutorState
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
158
|
+
function_executor_state_proto = FunctionExecutorStateProto(
|
159
|
+
description=FunctionExecutorDescription(
|
160
|
+
id=function_executor_state.id,
|
161
|
+
namespace=function_executor_state.namespace,
|
162
|
+
graph_name=function_executor_state.graph_name,
|
163
|
+
graph_version=function_executor_state.graph_version,
|
164
|
+
function_name=function_executor_state.function_name,
|
165
|
+
secret_names=function_executor_state.secret_names,
|
166
|
+
),
|
167
|
+
status=_to_grpc_function_executor_status(
|
168
|
+
function_executor_state.status, self._logger
|
169
|
+
),
|
170
|
+
status_message=function_executor_state.status_message,
|
145
171
|
)
|
172
|
+
if function_executor_state.image_uri:
|
173
|
+
function_executor_state_proto.description.image_uri = (
|
174
|
+
function_executor_state.image_uri
|
175
|
+
)
|
176
|
+
states.append(function_executor_state_proto)
|
146
177
|
|
147
178
|
return states
|
148
179
|
|
@@ -182,7 +213,7 @@ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
|
|
182
213
|
FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
|
183
214
|
FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
|
184
215
|
FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
|
185
|
-
FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.
|
216
|
+
FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
|
186
217
|
}
|
187
218
|
|
188
219
|
|
@@ -197,3 +228,33 @@ def _to_grpc_function_executor_status(
|
|
197
228
|
logger.error("Unexpected Function Executor status", status=status)
|
198
229
|
|
199
230
|
return result
|
231
|
+
|
232
|
+
|
233
|
+
_FLAVOR_MAPPING = {
|
234
|
+
ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
|
235
|
+
ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
|
236
|
+
}
|
237
|
+
|
238
|
+
|
239
|
+
def _to_grpc_executor_flavor(
|
240
|
+
flavor: ExecutorFlavor, logger: Any
|
241
|
+
) -> ExecutorFlavorProto:
|
242
|
+
result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
|
243
|
+
flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
|
244
|
+
)
|
245
|
+
|
246
|
+
if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
|
247
|
+
logger.error("Unexpected Executor flavor", flavor=flavor)
|
248
|
+
|
249
|
+
return result
|
250
|
+
|
251
|
+
|
252
|
+
def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
|
253
|
+
return {k: str(v) for k, v in labels.items()}
|
254
|
+
|
255
|
+
|
256
|
+
def _state_hash(state: ExecutorState) -> str:
|
257
|
+
serialized_state: bytes = state.SerializeToString(deterministic=True)
|
258
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
259
|
+
hasher.update(serialized_state)
|
260
|
+
return hasher.hexdigest()
|