indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -1,314 +0,0 @@
|
|
1
|
-
import asyncio
|
2
|
-
import hashlib
|
3
|
-
from socket import gethostname
|
4
|
-
from typing import Any, Dict, List, Optional
|
5
|
-
|
6
|
-
from indexify.proto.executor_api_pb2 import (
|
7
|
-
AllowedFunction,
|
8
|
-
)
|
9
|
-
from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
|
10
|
-
from indexify.proto.executor_api_pb2 import (
|
11
|
-
ExecutorState,
|
12
|
-
ExecutorStatus,
|
13
|
-
FunctionExecutorDescription,
|
14
|
-
)
|
15
|
-
from indexify.proto.executor_api_pb2 import (
|
16
|
-
FunctionExecutorState as FunctionExecutorStateProto,
|
17
|
-
)
|
18
|
-
from indexify.proto.executor_api_pb2 import (
|
19
|
-
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
20
|
-
)
|
21
|
-
from indexify.proto.executor_api_pb2 import GPUModel as GPUModelProto
|
22
|
-
from indexify.proto.executor_api_pb2 import GPUResources as GPUResourcesProto
|
23
|
-
from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
|
24
|
-
from indexify.proto.executor_api_pb2 import (
|
25
|
-
ReportExecutorStateRequest,
|
26
|
-
)
|
27
|
-
from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
|
28
|
-
|
29
|
-
from ..api_objects import FunctionURI
|
30
|
-
from ..executor_flavor import ExecutorFlavor
|
31
|
-
from ..function_executor.function_executor_state import FunctionExecutorState
|
32
|
-
from ..function_executor.function_executor_states_container import (
|
33
|
-
FunctionExecutorStatesContainer,
|
34
|
-
)
|
35
|
-
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
36
|
-
from ..host_resources.host_resources import HostResources, HostResourcesProvider
|
37
|
-
from ..host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
|
38
|
-
from ..runtime_probes import RuntimeProbes
|
39
|
-
from .channel_manager import ChannelManager
|
40
|
-
from .metrics.state_reporter import (
|
41
|
-
metric_state_report_errors,
|
42
|
-
metric_state_report_latency,
|
43
|
-
metric_state_report_rpcs,
|
44
|
-
)
|
45
|
-
|
46
|
-
_REPORTING_INTERVAL_SEC = 5
|
47
|
-
_REPORT_RPC_TIMEOUT_SEC = 5
|
48
|
-
_REPORT_BACKOFF_ON_ERROR_SEC = 5
|
49
|
-
|
50
|
-
|
51
|
-
class ExecutorStateReporter:
    """Periodically reports the state of this Executor to the Server.

    A report carries the Executor id, hostname, flavor, version, status,
    labels, allowed functions, host resources and the states of all its
    Function Executors.
    """

    def __init__(
        self,
        executor_id: str,
        flavor: ExecutorFlavor,
        version: str,
        labels: Dict[str, str],
        function_allowlist: Optional[List[FunctionURI]],
        function_executor_states: FunctionExecutorStatesContainer,
        channel_manager: ChannelManager,
        host_resources_provider: HostResourcesProvider,
        logger: Any,
        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
    ):
        """Creates the reporter.

        Args:
            executor_id: Value reported as ExecutorState.executor_id.
            flavor: Executor flavor, converted to its proto value on each report.
            version: Value reported as ExecutorState.version.
            labels: Labels to report. The dict is copied and then extended with
                the labels returned by RuntimeProbes.
            function_allowlist: Functions to report as allowed. None is reported
                as an empty list.
            function_executor_states: Container iterated on each report to
                collect the Function Executor states.
            channel_manager: Source of the gRPC channel used for the reports.
            host_resources_provider: Queried once for the total host resources.
            logger: Logger that supports bind() and keyword arguments.
            reporting_interval_sec: Seconds to sleep after each successful report.
        """
        self._executor_id: str = executor_id
        self._flavor: ExecutorFlavor = flavor
        self._version: str = version
        # Copy so the caller's dict is not modified by the update() below.
        self._labels: Dict[str, str] = labels.copy()
        self._hostname: str = gethostname()
        self._function_executor_states: FunctionExecutorStatesContainer = (
            function_executor_states
        )
        self._channel_manager = channel_manager
        self._host_resources_provider: HostResourcesProvider = host_resources_provider
        self._logger: Any = logger.bind(module=__name__)
        self._reporting_interval_sec: int = reporting_interval_sec
        # Both are filled in lazily by the first report_state() call.
        self._total_host_resources: Optional[HostResourcesProto] = None
        self._total_function_executor_resources: Optional[HostResourcesProto] = None

        self._is_shutdown: bool = False
        self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
        self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
            function_allowlist
        )
        self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
        self._last_server_clock: int = (
            0  # Server expects initial value to be 0 until it is set by Server.
        )

    def update_executor_status(self, value: ExecutorStatus):
        """Sets the Executor status sent with the next state reports."""
        self._executor_status = value

    def update_last_server_clock(self, value: int):
        """Sets the server clock value sent with the next state reports."""
        self._last_server_clock = value

    async def run(self):
        """Runs the state reporter.

        Loops until shutdown() is called. Every failure is logged and followed
        by a backoff sleep, after which a channel is requested again.

        Never raises any exceptions.
        """
        # TODO: Move this method into a new async task and cancel it in shutdown().
        while not self._is_shutdown:
            try:
                # Getting the channel is inside the try block so that a failure
                # to obtain it is logged and retried instead of escaping run().
                stub = ExecutorAPIStub(await self._channel_manager.get_channel())
                while not self._is_shutdown:
                    # The periodic state reports serve as channel health monitoring requests
                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
                    # from this channel health monitoring.
                    await self.report_state(stub)
                    await asyncio.sleep(self._reporting_interval_sec)
            except Exception as e:
                self._logger.error(
                    f"failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
                    exc_info=e,
                )
                # Leaving the inner loop makes the outer loop ask for a channel again.
                await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)

        self._logger.info("state reporter shutdown")

    async def report_state(self, stub: ExecutorAPIStub):
        """Reports the current state to the server represented by the supplied stub.

        Raises exceptions on failure.
        """
        if self._total_host_resources is None:
            # We need to fetch total resources only once, because they are not changing.
            total_host_resources: HostResources = (
                await self._host_resources_provider.total_host_resources(self._logger)
            )
            total_function_executor_resources: HostResources = (
                await self._host_resources_provider.total_function_executor_resources(
                    self._logger
                )
            )
            self._logger.info(
                "detected host resources",
                total_host_resources=total_host_resources,
                total_function_executor_resources=total_function_executor_resources,
            )
            self._total_host_resources = _host_resources_to_proto(total_host_resources)
            self._total_function_executor_resources = _host_resources_to_proto(
                total_function_executor_resources
            )

        with (
            metric_state_report_errors.count_exceptions(),
            metric_state_report_latency.time(),
        ):
            metric_state_report_rpcs.inc()
            state = ExecutorState(
                executor_id=self._executor_id,
                hostname=self._hostname,
                flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
                version=self._version,
                status=self._executor_status,
                total_function_executor_resources=self._total_function_executor_resources,
                total_resources=self._total_host_resources,
                allowed_functions=self._allowed_functions,
                function_executor_states=await self._fetch_function_executor_states(),
                labels=self._labels,
            )
            # The hash is computed before server_clock is set, so it only covers
            # the fields assigned above.
            state.state_hash = _state_hash(state)
            # Set fields not included in the state hash.
            state.server_clock = self._last_server_clock

            await stub.report_executor_state(
                ReportExecutorStateRequest(executor_state=state),
                timeout=_REPORT_RPC_TIMEOUT_SEC,
            )

    async def shutdown(self):
        """Shuts down the state reporter.

        Only sets the shutdown flag; run() observes it on its next loop check.

        Never raises any exceptions.
        """
        self._is_shutdown = True

    async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
        """Builds the proto state of every Function Executor in the container."""
        states = []

        async for function_executor_state in self._function_executor_states:
            function_executor_state: FunctionExecutorState
            function_executor_state_proto = FunctionExecutorStateProto(
                description=FunctionExecutorDescription(
                    id=function_executor_state.id,
                    namespace=function_executor_state.namespace,
                    graph_name=function_executor_state.graph_name,
                    graph_version=function_executor_state.graph_version,
                    function_name=function_executor_state.function_name,
                    secret_names=function_executor_state.secret_names,
                ),
                status=_to_grpc_function_executor_status(
                    function_executor_state.status, self._logger
                ),
            )
            # image_uri is left unset in the proto when it is empty or None.
            if function_executor_state.image_uri:
                function_executor_state_proto.description.image_uri = (
                    function_executor_state.image_uri
                )
            states.append(function_executor_state_proto)

        return states
|
205
|
-
|
206
|
-
|
207
|
-
def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
|
208
|
-
if function_allowlist is None:
|
209
|
-
return []
|
210
|
-
|
211
|
-
allowed_functions: List[AllowedFunction] = []
|
212
|
-
for function_uri in function_allowlist:
|
213
|
-
function_uri: FunctionURI
|
214
|
-
allowed_function = AllowedFunction(
|
215
|
-
namespace=function_uri.namespace,
|
216
|
-
graph_name=function_uri.compute_graph,
|
217
|
-
function_name=function_uri.compute_fn,
|
218
|
-
)
|
219
|
-
if function_uri.version is not None:
|
220
|
-
allowed_function.graph_version = function_uri.version
|
221
|
-
allowed_functions.append(allowed_function)
|
222
|
-
|
223
|
-
return allowed_functions
|
224
|
-
|
225
|
-
|
226
|
-
# Maps internal Function Executor statuses to their proto counterparts.
# Statuses missing from this table are reported as
# FUNCTION_EXECUTOR_STATUS_UNKNOWN by _to_grpc_function_executor_status().
_STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
    FunctionExecutorStatus.STARTING_UP: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR,
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR,
    FunctionExecutorStatus.IDLE: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
    FunctionExecutorStatus.RUNNING_TASK: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
    FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
    # DESTROYING and DESTROYED are named STOPPING and STOPPED on the proto side.
    FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
    FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
}
|
237
|
-
|
238
|
-
|
239
|
-
def _to_grpc_function_executor_status(
    status: FunctionExecutorStatus, logger: Any
) -> FunctionExecutorStatusProto:
    """Converts an internal Function Executor status to its proto value.

    Args:
        status: Internal status to convert.
        logger: Logger used to report a status that has no mapping.

    Returns:
        The mapped proto status, or FUNCTION_EXECUTOR_STATUS_UNKNOWN when the
        status is not present in _STATUS_MAPPING (an error is logged then).
    """
    result: FunctionExecutorStatusProto = _STATUS_MAPPING.get(
        status, FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN
    )

    if result == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
        logger.error("unexpected Function Executor status", status=status)

    return result
|
250
|
-
|
251
|
-
|
252
|
-
# Maps internal Executor flavors to their proto counterparts. Flavors missing
# from this table are reported as EXECUTOR_FLAVOR_UNKNOWN by
# _to_grpc_executor_flavor().
_FLAVOR_MAPPING = {
    ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
    ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
}
|
256
|
-
|
257
|
-
|
258
|
-
def _to_grpc_executor_flavor(
    flavor: ExecutorFlavor, logger: Any
) -> ExecutorFlavorProto:
    """Converts an internal Executor flavor to its proto value.

    Args:
        flavor: Internal flavor to convert.
        logger: Logger used to report a flavor that has no mapping.

    Returns:
        The mapped proto flavor, or EXECUTOR_FLAVOR_UNKNOWN when the flavor is
        not present in _FLAVOR_MAPPING (an error is logged then).
    """
    result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
        flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
    )

    if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
        logger.error("unexpected Executor flavor", flavor=flavor)

    return result
|
269
|
-
|
270
|
-
|
271
|
-
def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
|
272
|
-
return {k: str(v) for k, v in labels.items()}
|
273
|
-
|
274
|
-
|
275
|
-
def _state_hash(state: ExecutorState) -> str:
|
276
|
-
serialized_state: bytes = state.SerializeToString(deterministic=True)
|
277
|
-
hasher = hashlib.sha256(usedforsecurity=False)
|
278
|
-
hasher.update(serialized_state)
|
279
|
-
return hasher.hexdigest()
|
280
|
-
|
281
|
-
|
282
|
-
def _host_resources_to_proto(host_resources: HostResources) -> HostResourcesProto:
    """Converts HostResources into its proto representation.

    Memory and disk sizes are converted from megabytes to bytes. The gpu field
    is set only when at least one GPU is present.
    """
    bytes_per_mb = 1024 * 1024
    proto = HostResourcesProto(
        cpu_count=host_resources.cpu_count,
        memory_bytes=host_resources.memory_mb * bytes_per_mb,
        disk_bytes=host_resources.disk_mb * bytes_per_mb,
    )
    if host_resources.gpus:
        proto.gpu.CopyFrom(
            GPUResourcesProto(
                count=len(host_resources.gpus),
                model=_gpu_model_to_proto(
                    host_resources.gpus[0].model
                ),  # All GPUs have the same model
            )
        )
    return proto
|
298
|
-
|
299
|
-
|
300
|
-
# Maps detected NVIDIA GPU models to their proto counterparts. Models missing
# from this table are reported as GPU_MODEL_UNKNOWN by _gpu_model_to_proto().
_GPU_MODEL_MAPPING = {
    NVIDIA_GPU_MODEL.A100_40GB: GPUModelProto.GPU_MODEL_NVIDIA_A100_40GB,
    NVIDIA_GPU_MODEL.A100_80GB: GPUModelProto.GPU_MODEL_NVIDIA_A100_80GB,
    NVIDIA_GPU_MODEL.H100_80GB: GPUModelProto.GPU_MODEL_NVIDIA_H100_80GB,
    NVIDIA_GPU_MODEL.TESLA_T4: GPUModelProto.GPU_MODEL_NVIDIA_TESLA_T4,
    NVIDIA_GPU_MODEL.A6000: GPUModelProto.GPU_MODEL_NVIDIA_A6000,
    NVIDIA_GPU_MODEL.A10: GPUModelProto.GPU_MODEL_NVIDIA_A10,
}


def _gpu_model_to_proto(gpu_model: NVIDIA_GPU_MODEL) -> GPUModelProto:
    """Converts an NVIDIA GPU model to its proto value.

    Returns:
        The mapped proto model, or GPU_MODEL_UNKNOWN when the model has no
        entry in _GPU_MODEL_MAPPING.
    """
    return _GPU_MODEL_MAPPING.get(gpu_model, GPUModelProto.GPU_MODEL_UNKNOWN)
|