indexify 0.3.28__tar.gz → 0.3.30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.28 → indexify-0.3.30}/PKG-INFO +1 -1
- {indexify-0.3.28 → indexify-0.3.30}/pyproject.toml +1 -1
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/cli/cli.py +25 -33
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/executor.py +0 -3
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +12 -28
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/state_reporter.py +4 -3
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/host_resources/nvidia_gpu.py +26 -12
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/proto/executor_api.proto +2 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/proto/executor_api_pb2.py +14 -14
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/proto/executor_api_pb2.pyi +4 -0
- {indexify-0.3.28 → indexify-0.3.30}/README.md +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/api_objects.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/blob_store/blob_store.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/downloader.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/executor_flavor.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/function_executor.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/function_executor_state.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/function_executor_states_container.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/health_checker.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/task_output.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/channel_manager.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/function_executor_controller.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/metrics/state_reconciler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/metrics/task_controller.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/state_reconciler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/task_controller.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/host_resources/host_resources.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/metrics/downloader.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/metrics/executor.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/metrics/task_fetcher.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/metrics/task_reporter.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/metrics/task_runner.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/handler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/metrics.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/server.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/runtime_probes.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/task_fetcher.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/task_reporter.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/task_runner.py +0 -0
- {indexify-0.3.28 → indexify-0.3.30}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this package are updated.
|
4
|
-
version = "0.3.28"
|
4
|
+
version = "0.3.30"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -78,13 +78,15 @@ def build_image(
|
|
78
78
|
|
79
79
|
|
80
80
|
@app.command(
|
81
|
-
|
81
|
+
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
82
|
+
help="Runs Executor that connects to the Indexify server and starts running its tasks",
|
82
83
|
)
|
83
84
|
def executor(
|
85
|
+
ctx: typer.Context,
|
84
86
|
server_addr: str = "localhost:8900",
|
85
87
|
grpc_server_addr: str = "localhost:8901",
|
86
|
-
|
87
|
-
bool, typer.Option("--
|
88
|
+
verbose_logs: Annotated[
|
89
|
+
bool, typer.Option("--verbose", "-v", help="Run the executor in verbose mode")
|
88
90
|
] = False,
|
89
91
|
function_uris: Annotated[
|
90
92
|
Optional[List[str]],
|
@@ -103,11 +105,6 @@ def executor(
|
|
103
105
|
executor_cache: Optional[str] = typer.Option(
|
104
106
|
"~/.indexify/executor_cache", help="Path to the executor cache directory"
|
105
107
|
),
|
106
|
-
# Registered ports range ends at 49151.
|
107
|
-
ports: Tuple[int, int] = typer.Option(
|
108
|
-
(50000, 51000),
|
109
|
-
help="Range of localhost TCP ports to be used by Function Executors",
|
110
|
-
),
|
111
108
|
monitoring_server_host: Annotated[
|
112
109
|
str,
|
113
110
|
typer.Option(
|
@@ -131,16 +128,22 @@ def executor(
|
|
131
128
|
"Specified as <key>=<value>",
|
132
129
|
),
|
133
130
|
] = [],
|
131
|
+
enable_grpc_state_reconciler: Annotated[
|
132
|
+
bool,
|
133
|
+
typer.Option(
|
134
|
+
"--enable-grpc-state-reconciler",
|
135
|
+
help=(
|
136
|
+
"(exprimental) Enable gRPC state reconciler that will reconcile the state of the Function Executors and Task Allocations\n"
|
137
|
+
"with the desired state provided by Server. Required --grpc-server-addr to be set."
|
138
|
+
),
|
139
|
+
),
|
140
|
+
] = False,
|
134
141
|
):
|
135
|
-
if
|
142
|
+
if verbose_logs:
|
136
143
|
compact_tracebacks: bool = os.getenv("INDEXIFY_COMPACT_TRACEBACKS", "1") == "1"
|
137
144
|
configure_development_mode_logging(compact_tracebacks=compact_tracebacks)
|
138
145
|
else:
|
139
146
|
configure_production_mode_logging()
|
140
|
-
if function_uris is None:
|
141
|
-
raise typer.BadParameter(
|
142
|
-
"At least one function must be specified when not running in development mode"
|
143
|
-
)
|
144
147
|
|
145
148
|
kv_labels: Dict[str, str] = {}
|
146
149
|
for label in labels:
|
@@ -160,30 +163,23 @@ def executor(
|
|
160
163
|
executor_version=executor_version,
|
161
164
|
labels=kv_labels,
|
162
165
|
executor_cache=executor_cache,
|
163
|
-
ports=ports,
|
164
166
|
functions=function_uris,
|
165
|
-
|
167
|
+
verbose_logs=verbose_logs,
|
166
168
|
monitoring_server_host=monitoring_server_host,
|
167
169
|
monitoring_server_port=monitoring_server_port,
|
168
|
-
enable_grpc_state_reconciler=
|
170
|
+
enable_grpc_state_reconciler=enable_grpc_state_reconciler,
|
169
171
|
)
|
172
|
+
if ctx.args:
|
173
|
+
logger.warning(
|
174
|
+
"Unknown arguments passed to the executor",
|
175
|
+
unknown_args=ctx.args,
|
176
|
+
)
|
170
177
|
|
171
178
|
executor_cache = Path(executor_cache).expanduser().absolute()
|
172
179
|
if os.path.exists(executor_cache):
|
173
180
|
shutil.rmtree(executor_cache)
|
174
181
|
Path(executor_cache).mkdir(parents=True, exist_ok=True)
|
175
182
|
|
176
|
-
start_port: int = ports[0]
|
177
|
-
end_port: int = ports[1]
|
178
|
-
if start_port >= end_port:
|
179
|
-
console.print(
|
180
|
-
Text(
|
181
|
-
f"start port {start_port} should be less than {end_port}", style="red"
|
182
|
-
),
|
183
|
-
)
|
184
|
-
exit(1)
|
185
|
-
|
186
|
-
# Enable all available blob stores in OSS because we don't know which one is going to be used.
|
187
183
|
blob_store: BLOBStore = BLOBStore(
|
188
184
|
# Local FS mode is used in tests and in cases when user wants to store data on NFS.
|
189
185
|
local=LocalFSBLOBStore(),
|
@@ -209,23 +205,19 @@ def executor(
|
|
209
205
|
|
210
206
|
Executor(
|
211
207
|
id=executor_id,
|
212
|
-
development_mode=dev,
|
213
208
|
flavor=ExecutorFlavor.OSS,
|
214
209
|
version=executor_version,
|
215
210
|
labels=kv_labels,
|
216
211
|
health_checker=GenericHealthChecker(),
|
217
212
|
code_path=executor_cache,
|
218
213
|
function_allowlist=_parse_function_uris(function_uris),
|
219
|
-
function_executor_server_factory=SubprocessFunctionExecutorServerFactory(
|
220
|
-
development_mode=dev,
|
221
|
-
server_ports=range(ports[0], ports[1]),
|
222
|
-
),
|
214
|
+
function_executor_server_factory=SubprocessFunctionExecutorServerFactory(),
|
223
215
|
server_addr=server_addr,
|
224
216
|
grpc_server_addr=grpc_server_addr,
|
225
217
|
config_path=config_path,
|
226
218
|
monitoring_server_host=monitoring_server_host,
|
227
219
|
monitoring_server_port=monitoring_server_port,
|
228
|
-
enable_grpc_state_reconciler=
|
220
|
+
enable_grpc_state_reconciler=enable_grpc_state_reconciler,
|
229
221
|
blob_store=blob_store,
|
230
222
|
host_resources_provider=host_resources_provider,
|
231
223
|
).run()
|
@@ -57,7 +57,6 @@ class Executor:
|
|
57
57
|
def __init__(
|
58
58
|
self,
|
59
59
|
id: str,
|
60
|
-
development_mode: bool,
|
61
60
|
flavor: ExecutorFlavor,
|
62
61
|
version: str,
|
63
62
|
labels: Dict[str, str],
|
@@ -116,7 +115,6 @@ class Executor:
|
|
116
115
|
flavor=flavor,
|
117
116
|
version=version,
|
118
117
|
labels=labels,
|
119
|
-
development_mode=development_mode,
|
120
118
|
function_allowlist=self._function_allowlist,
|
121
119
|
function_executor_states=self._function_executor_states,
|
122
120
|
channel_manager=self._channel_manager,
|
@@ -173,7 +171,6 @@ class Executor:
|
|
173
171
|
|
174
172
|
executor_info: Dict[str, str] = {
|
175
173
|
"id": id,
|
176
|
-
"dev_mode": str(development_mode),
|
177
174
|
"flavor": flavor.name,
|
178
175
|
"version": version,
|
179
176
|
"code_path": str(code_path),
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import asyncio
|
2
2
|
import os
|
3
3
|
import signal
|
4
|
-
|
4
|
+
import socket
|
5
|
+
from typing import Any, Optional
|
5
6
|
|
6
7
|
from .function_executor_server_factory import (
|
7
8
|
FunctionExecutorServerConfiguration,
|
@@ -10,15 +11,15 @@ from .function_executor_server_factory import (
|
|
10
11
|
from .subprocess_function_executor_server import SubprocessFunctionExecutorServer
|
11
12
|
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
self._free_ports: List[int] = list(reversed(server_ports))
|
14
|
+
def get_free_tcp_port(iface_name="localhost") -> int:
|
15
|
+
tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
16
|
+
tcp.bind((iface_name, 0))
|
17
|
+
_, port = tcp.getsockname()
|
18
|
+
tcp.close()
|
19
|
+
return port
|
20
|
+
|
21
21
|
|
22
|
+
class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
|
22
23
|
async def create(
|
23
24
|
self, config: FunctionExecutorServerConfiguration, logger: Any
|
24
25
|
) -> SubprocessFunctionExecutorServer:
|
@@ -32,14 +33,13 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
|
|
32
33
|
)
|
33
34
|
|
34
35
|
try:
|
35
|
-
port =
|
36
|
+
port = get_free_tcp_port()
|
37
|
+
logger.info("allocated function executor port", port=port)
|
36
38
|
args = [
|
37
39
|
f"--executor-id={config.executor_id}", # use = as executor_id can start with -
|
38
40
|
"--address",
|
39
41
|
_server_address(port),
|
40
42
|
]
|
41
|
-
if self._development_mode:
|
42
|
-
args.append("--dev")
|
43
43
|
# Run the process with our stdout, stderr. We want to see process logs and exceptions in our process output.
|
44
44
|
# This is useful for debugging. Customer function stdout and stderr is captured and returned in the response
|
45
45
|
# so we won't see it in our process outputs. This is the right behavior as customer function stdout and stderr
|
@@ -56,8 +56,6 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
|
|
56
56
|
address=_server_address(port),
|
57
57
|
)
|
58
58
|
except Exception as e:
|
59
|
-
if port is not None:
|
60
|
-
self._release_port(port)
|
61
59
|
logger.error(
|
62
60
|
"failed starting a new Function Executor process at port {port}",
|
63
61
|
exc_info=e,
|
@@ -91,20 +89,6 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
|
|
91
89
|
"failed to cleanup Function Executor process",
|
92
90
|
exc_info=e,
|
93
91
|
)
|
94
|
-
finally:
|
95
|
-
self._release_port(port)
|
96
|
-
|
97
|
-
def _allocate_port(self) -> int:
|
98
|
-
# No asyncio.Lock is required here because this operation never awaits
|
99
|
-
# and it is always called from the same thread where the event loop is running.
|
100
|
-
return self._free_ports.pop()
|
101
|
-
|
102
|
-
def _release_port(self, port: int) -> None:
|
103
|
-
# No asyncio.Lock is required here because this operation never awaits
|
104
|
-
# and it is always called from the same thread where the event loop is running.
|
105
|
-
#
|
106
|
-
# Prefer port reuse to repro as many possible issues deterministically as possible.
|
107
|
-
self._free_ports.append(port)
|
108
92
|
|
109
93
|
|
110
94
|
def _server_address(port: int) -> str:
|
@@ -55,7 +55,6 @@ class ExecutorStateReporter:
|
|
55
55
|
flavor: ExecutorFlavor,
|
56
56
|
version: str,
|
57
57
|
labels: Dict[str, str],
|
58
|
-
development_mode: bool,
|
59
58
|
function_allowlist: Optional[List[FunctionURI]],
|
60
59
|
function_executor_states: FunctionExecutorStatesContainer,
|
61
60
|
channel_manager: ChannelManager,
|
@@ -67,7 +66,6 @@ class ExecutorStateReporter:
|
|
67
66
|
self._flavor: ExecutorFlavor = flavor
|
68
67
|
self._version: str = version
|
69
68
|
self._labels: Dict[str, str] = labels.copy()
|
70
|
-
self._development_mode: bool = development_mode
|
71
69
|
self._hostname: str = gethostname()
|
72
70
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
73
71
|
function_executor_states
|
@@ -153,7 +151,6 @@ class ExecutorStateReporter:
|
|
153
151
|
metric_state_report_rpcs.inc()
|
154
152
|
state = ExecutorState(
|
155
153
|
executor_id=self._executor_id,
|
156
|
-
development_mode=self._development_mode,
|
157
154
|
hostname=self._hostname,
|
158
155
|
flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
|
159
156
|
version=self._version,
|
@@ -309,5 +306,9 @@ def _gpu_model_to_proto(gpu_model: NVIDIA_GPU_MODEL) -> GPUModelProto:
|
|
309
306
|
return GPUModelProto.GPU_MODEL_NVIDIA_H100_80GB
|
310
307
|
elif gpu_model == NVIDIA_GPU_MODEL.TESLA_T4:
|
311
308
|
return GPUModelProto.GPU_MODEL_NVIDIA_TESLA_T4
|
309
|
+
elif gpu_model == NVIDIA_GPU_MODEL.A6000:
|
310
|
+
return GPUModelProto.GPU_MODEL_NVIDIA_A6000
|
311
|
+
elif gpu_model == NVIDIA_GPU_MODEL.A10:
|
312
|
+
return GPUModelProto.GPU_MODEL_NVIDIA_A10
|
312
313
|
else:
|
313
314
|
return GPUModelProto.GPU_MODEL_UNKNOWN
|
@@ -6,12 +6,15 @@ from pydantic import BaseModel
|
|
6
6
|
|
7
7
|
|
8
8
|
# Only NVIDIA GPUs currently supported in Tensorlake SDK are listed here.
|
9
|
+
# GPU models coming with multiple memory sizes have a different enum value per memory size.
|
9
10
|
class NVIDIA_GPU_MODEL(str, Enum):
|
10
11
|
UNKNOWN = "UNKNOWN"
|
11
12
|
A100_40GB = "A100-40GB"
|
12
13
|
A100_80GB = "A100-80GB"
|
13
|
-
H100_80GB = "H100"
|
14
|
+
H100_80GB = "H100-80GB"
|
14
15
|
TESLA_T4 = "T4"
|
16
|
+
A6000 = "A6000"
|
17
|
+
A10 = "A10"
|
15
18
|
|
16
19
|
|
17
20
|
class NvidiaGPUInfo(BaseModel):
|
@@ -54,28 +57,39 @@ def fetch_nvidia_gpu_infos(logger: Any) -> List[NvidiaGPUInfo]:
|
|
54
57
|
# 1, NVIDIA A100-PCIE-40GB, GPU-e9c9aa65-bff3-405a-ab7c-dc879cc88169
|
55
58
|
# 2, NVIDIA H100 80GB HBM3, GPU-8c35f4c9-4dff-c9a2-866f-afb5d82e1dd7
|
56
59
|
# 3, Tesla T4, GPU-2a7fadae-a692-1c44-2c57-6645a0d117e4
|
60
|
+
# 4, NVIDIA RTX A6000, GPU-efe4927a-743f-e4cc-28bb-da604f545b6d
|
61
|
+
# 5, NVIDIA A10, GPU-12463b8c-40bb-7322-6c7a-ef48bd7bd39b
|
57
62
|
parts = line.split(",")
|
58
63
|
index = parts[0].strip()
|
59
64
|
product_name = parts[1].strip()
|
60
65
|
uuid = parts[2].strip()
|
61
66
|
|
62
|
-
model =
|
63
|
-
if
|
64
|
-
model = NVIDIA_GPU_MODEL.A100_80GB
|
65
|
-
if product_name.startswith("NVIDIA A100") and product_name.endswith("40GB"):
|
66
|
-
model = NVIDIA_GPU_MODEL.A100_40GB
|
67
|
-
elif product_name.startswith("NVIDIA H100"):
|
68
|
-
model = NVIDIA_GPU_MODEL.H100_80GB
|
69
|
-
elif product_name.startswith("Tesla T4"):
|
70
|
-
model = NVIDIA_GPU_MODEL.TESLA_T4
|
71
|
-
else:
|
67
|
+
model = _product_name_to_model(product_name)
|
68
|
+
if model == NVIDIA_GPU_MODEL.UNKNOWN:
|
72
69
|
logger.warning(
|
73
70
|
"Unknown GPU model was detected, ignoring", nvidia_smi_output=line
|
74
71
|
)
|
75
|
-
|
76
72
|
infos.append(
|
77
73
|
NvidiaGPUInfo(
|
78
74
|
index=index, uuid=uuid, product_name=product_name, model=model
|
79
75
|
)
|
80
76
|
)
|
77
|
+
|
81
78
|
return infos
|
79
|
+
|
80
|
+
|
81
|
+
def _product_name_to_model(product_name: str) -> NVIDIA_GPU_MODEL:
|
82
|
+
if product_name.startswith("NVIDIA A100") and product_name.endswith("80GB"):
|
83
|
+
return NVIDIA_GPU_MODEL.A100_80GB
|
84
|
+
if product_name.startswith("NVIDIA A100") and product_name.endswith("40GB"):
|
85
|
+
return NVIDIA_GPU_MODEL.A100_40GB
|
86
|
+
elif product_name.startswith("NVIDIA H100") and "80GB" in product_name:
|
87
|
+
return NVIDIA_GPU_MODEL.H100_80GB
|
88
|
+
elif product_name.startswith("Tesla T4"):
|
89
|
+
return NVIDIA_GPU_MODEL.TESLA_T4
|
90
|
+
elif product_name.startswith("NVIDIA RTX A6000"):
|
91
|
+
return NVIDIA_GPU_MODEL.A6000
|
92
|
+
elif product_name.startswith("NVIDIA A10"):
|
93
|
+
return NVIDIA_GPU_MODEL.A10
|
94
|
+
else:
|
95
|
+
return NVIDIA_GPU_MODEL.UNKNOWN
|
@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
|
|
19
19
|
|
20
20
|
|
21
21
|
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
22
|
-
b'\n!indexify/proto/executor_api.proto\x12\x0f\x65xecutor_api_pb"\x87\x02\n\x0b\x44\x61taPayload\x12\x11\n\x04path\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x11\n\x04size\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x18\n\x0bsha256_hash\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x10\n\x03uri\x18\x04 \x01(\tH\x03\x88\x01\x01\x12;\n\x08\x65ncoding\x18\x05 \x01(\x0e\x32$.executor_api_pb.DataPayloadEncodingH\x04\x88\x01\x01\x12\x1d\n\x10\x65ncoding_version\x18\x06 \x01(\x04H\x05\x88\x01\x01\x42\x07\n\x05_pathB\x07\n\x05_sizeB\x0e\n\x0c_sha256_hashB\x06\n\x04_uriB\x0b\n\t_encodingB\x13\n\x11_encoding_version"k\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12-\n\x05model\x18\x02 \x01(\x0e\x32\x19.executor_api_pb.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_modelJ\x04\x08\x03\x10\x04"\xc2\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12/\n\x03gpu\x18\x04 \x01(\x0b\x32\x1d.executor_api_pb.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xc5\x01\n\x19\x46unctionExecutorResources\x12\x1b\n\x0e\x63pu_ms_per_sec\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12\x16\n\tgpu_count\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x11\n\x0f_cpu_ms_per_secB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x0c\n\n_gpu_count"\xbf\x04\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 
\x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12<\n\x0fresource_limits\x18\x08 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12%\n\x18\x63ustomer_code_timeout_ms\x18\t \x01(\rH\x07\x88\x01\x01\x12\x30\n\x05graph\x18\n \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\x08\x88\x01\x01\x12\x42\n\tresources\x18\x0b \x01(\x0b\x32*.executor_api_pb.FunctionExecutorResourcesH\t\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limitsB\x1b\n\x19_customer_code_timeout_msB\x08\n\x06_graphB\x0c\n\n_resources"\xbe\x01\n\x15\x46unctionExecutorState\x12\x46\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32,.executor_api_pb.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12<\n\x06status\x18\x02 \x01(\x0e\x32\'.executor_api_pb.FunctionExecutorStatusH\x01\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_statusJ\x04\x08\x03\x10\x04"\xc3\x06\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x15\n\x08hostname\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x34\n\x06\x66lavor\x18\x04 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorFlavorH\x03\x88\x01\x01\x12\x14\n\x07version\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x34\n\x06status\x18\x06 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorStatusH\x05\x88\x01\x01\x12<\n\x0ftotal_resources\x18\r \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12N\n!total_function_executor_resources\x18\x07 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x07\x88\x01\x01\x12;\n\x11\x61llowed_functions\x18\x08 \x03(\x0b\x32 
.executor_api_pb.AllowedFunction\x12H\n\x18\x66unction_executor_states\x18\t \x03(\x0b\x32&.executor_api_pb.FunctionExecutorState\x12:\n\x06labels\x18\n \x03(\x0b\x32*.executor_api_pb.ExecutorState.LabelsEntry\x12\x17\n\nstate_hash\x18\x0b \x01(\tH\x08\x88\x01\x01\x12\x19\n\x0cserver_clock\x18\x0c \x01(\x04H\t\x88\x01\x01\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x0b\n\t_hostnameB\t\n\x07_flavorB\n\n\x08_versionB\t\n\x07_statusB\x12\n\x10_total_resourcesB$\n"_total_function_executor_resourcesB\r\n\x0b_state_hashB\x0f\n\r_server_clock"l\n\x1aReportExecutorStateRequest\x12;\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32\x1e.executor_api_pb.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\xcf\x01\n\x0fTaskRetryPolicy\x12\x18\n\x0bmax_retries\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x1d\n\x10initial_delay_ms\x18\x02 \x01(\rH\x01\x88\x01\x01\x12\x19\n\x0cmax_delay_ms\x18\x03 \x01(\rH\x02\x88\x01\x01\x12\x1d\n\x10\x64\x65lay_multiplier\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x0e\n\x0c_max_retriesB\x13\n\x11_initial_delay_msB\x0f\n\r_max_delay_msB\x13\n\x11_delay_multiplier"\xa4\x05\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n \x01(\rH\x08\x88\x01\x01\x12\x30\n\x05input\x18\x0b \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x38\n\rreducer_input\x18\x0c \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12&\n\x19output_payload_uri_prefix\x18\r 
\x01(\tH\x0b\x88\x01\x01\x12;\n\x0cretry_policy\x18\x0e \x01(\x0b\x32 .executor_api_pb.TaskRetryPolicyH\x0c\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_msB\x08\n\x06_inputB\x10\n\x0e_reducer_inputB\x1c\n\x1a_output_payload_uri_prefixB\x0f\n\r_retry_policy"\x7f\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12(\n\x04task\x18\x02 \x01(\x0b\x32\x15.executor_api_pb.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xb9\x01\n\x14\x44\x65siredExecutorState\x12H\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32,.executor_api_pb.FunctionExecutorDescription\x12\x39\n\x10task_allocations\x18\x02 \x03(\x0b\x32\x1f.executor_api_pb.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock"\x87\x06\n\x18ReportTaskOutcomeRequest\x12\x14\n\x07task_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x04 \x01(\tH\x03\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x32\n\x07outcome\x18\x07 \x01(\x0e\x32\x1c.executor_api_pb.TaskOutcomeH\x05\x88\x01\x01\x12\x1a\n\rinvocation_id\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x18\n\x0b\x65xecutor_id\x18\t \x01(\tH\x07\x88\x01\x01\x12\x14\n\x07reducer\x18\n \x01(\x08H\x08\x88\x01\x01\x12\x16\n\x0enext_functions\x18\x0b \x03(\t\x12\x30\n\nfn_outputs\x18\x0c \x03(\x0b\x32\x1c.executor_api_pb.DataPayload\x12\x31\n\x06stdout\x18\x0e \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x31\n\x06stderr\x18\x0f \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12=\n\x0foutput_encoding\x18\r 
\x01(\x0e\x32\x1f.executor_api_pb.OutputEncodingH\x0b\x88\x01\x01\x12$\n\x17output_encoding_version\x18\x05 \x01(\x04H\x0c\x88\x01\x01\x42\n\n\x08_task_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\n\n\x08_outcomeB\x10\n\x0e_invocation_idB\x0e\n\x0c_executor_idB\n\n\x08_reducerB\t\n\x07_stdoutB\t\n\x07_stderrB\x12\n\x10_output_encodingB\x1a\n\x18_output_encoding_version"\x1b\n\x19ReportTaskOutcomeResponse*\xab\x01\n\x13\x44\x61taPayloadEncoding\x12!\n\x1d\x44\x41TA_PAYLOAD_ENCODING_UNKNOWN\x10\x00\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_JSON\x10\x01\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_TEXT\x10\x02\x12\'\n#DATA_PAYLOAD_ENCODING_BINARY_PICKLE\x10\x03*\xa0\x01\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_40GB\x10\x01\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_80GB\x10\x02\x12\x1e\n\x1aGPU_MODEL_NVIDIA_H100_80GB\x10\x03\x12\x1d\n\x19GPU_MODEL_NVIDIA_TESLA_T4\x10\x04*\xca\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08\x12%\n!FUNCTION_EXECUTOR_STATUS_SHUTDOWN\x10\t*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05*d\n\x0e\x45xecutorFlavor\x12\x1b\n\x17\x45XECUTOR_FLAVOR_UNKNOWN\x10\x00\x12\x17\n\x13\x45XECUTOR_FLAVOR_OSS\x10\x01\x12\x1c\n\x18\x45XECUTOR_FLAVOR_PLATFORM\x10\x02*[\n\x0bTaskOutcome\x12\x18\n\x14TASK_OUTCOME_UNKNOWN\x10\x00\x12\x18\n\x14TASK_OUTCOME_SUCCESS\x10\x01\x12\x18\n\x14TASK_OUTCOME_FAILURE\x10\x02*\x7f\n\x0eOutputEncoding\x12\x1b\n\x17OUTPUT_ENCODING_UNKNOWN\x10\x00\x12\x18\n\x14OUTPUT_ENCODING_JSON\x10\x01\x12\x1a\n\x16OUTPUT_ENCODING_PICKLE\x10\x02\x12\x1a\n\x16OUTPUT_ENCODING_BINARY\x10\x03\x32\xef\x02\n\x0b\x45xecutorAPI\x12t\n\x15report_executor_state\x12+.executor_api_pb.ReportExecutorStateRequest\x1a,.executor_api_pb.ReportExecutorStateResponse"\x00\x12z\n\x1bget_desired_executor_states\x12\x30.executor_api_pb.GetDesiredExecutorStatesRequest\x1a%.executor_api_pb.DesiredExecutorState"\x00\x30\x01\x12n\n\x13report_task_outcome\x12).executor_api_pb.ReportTaskOutcomeRequest\x1a*.executor_api_pb.ReportTaskOutcomeResponse"\x00\x62\x06proto3'
|
22
|
+
b'\n!indexify/proto/executor_api.proto\x12\x0f\x65xecutor_api_pb"\x87\x02\n\x0b\x44\x61taPayload\x12\x11\n\x04path\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x11\n\x04size\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x18\n\x0bsha256_hash\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x10\n\x03uri\x18\x04 \x01(\tH\x03\x88\x01\x01\x12;\n\x08\x65ncoding\x18\x05 \x01(\x0e\x32$.executor_api_pb.DataPayloadEncodingH\x04\x88\x01\x01\x12\x1d\n\x10\x65ncoding_version\x18\x06 \x01(\x04H\x05\x88\x01\x01\x42\x07\n\x05_pathB\x07\n\x05_sizeB\x0e\n\x0c_sha256_hashB\x06\n\x04_uriB\x0b\n\t_encodingB\x13\n\x11_encoding_version"k\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12-\n\x05model\x18\x02 \x01(\x0e\x32\x19.executor_api_pb.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_modelJ\x04\x08\x03\x10\x04"\xc2\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12/\n\x03gpu\x18\x04 \x01(\x0b\x32\x1d.executor_api_pb.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xc5\x01\n\x19\x46unctionExecutorResources\x12\x1b\n\x0e\x63pu_ms_per_sec\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12\x16\n\tgpu_count\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x11\n\x0f_cpu_ms_per_secB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x0c\n\n_gpu_count"\xbf\x04\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 
\x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12<\n\x0fresource_limits\x18\x08 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12%\n\x18\x63ustomer_code_timeout_ms\x18\t \x01(\rH\x07\x88\x01\x01\x12\x30\n\x05graph\x18\n \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\x08\x88\x01\x01\x12\x42\n\tresources\x18\x0b \x01(\x0b\x32*.executor_api_pb.FunctionExecutorResourcesH\t\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limitsB\x1b\n\x19_customer_code_timeout_msB\x08\n\x06_graphB\x0c\n\n_resources"\xbe\x01\n\x15\x46unctionExecutorState\x12\x46\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32,.executor_api_pb.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12<\n\x06status\x18\x02 \x01(\x0e\x32\'.executor_api_pb.FunctionExecutorStatusH\x01\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_statusJ\x04\x08\x03\x10\x04"\xc3\x06\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x15\n\x08hostname\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x34\n\x06\x66lavor\x18\x04 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorFlavorH\x03\x88\x01\x01\x12\x14\n\x07version\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x34\n\x06status\x18\x06 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorStatusH\x05\x88\x01\x01\x12<\n\x0ftotal_resources\x18\r \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12N\n!total_function_executor_resources\x18\x07 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x07\x88\x01\x01\x12;\n\x11\x61llowed_functions\x18\x08 \x03(\x0b\x32 
.executor_api_pb.AllowedFunction\x12H\n\x18\x66unction_executor_states\x18\t \x03(\x0b\x32&.executor_api_pb.FunctionExecutorState\x12:\n\x06labels\x18\n \x03(\x0b\x32*.executor_api_pb.ExecutorState.LabelsEntry\x12\x17\n\nstate_hash\x18\x0b \x01(\tH\x08\x88\x01\x01\x12\x19\n\x0cserver_clock\x18\x0c \x01(\x04H\t\x88\x01\x01\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x0b\n\t_hostnameB\t\n\x07_flavorB\n\n\x08_versionB\t\n\x07_statusB\x12\n\x10_total_resourcesB$\n"_total_function_executor_resourcesB\r\n\x0b_state_hashB\x0f\n\r_server_clock"l\n\x1aReportExecutorStateRequest\x12;\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32\x1e.executor_api_pb.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\xcf\x01\n\x0fTaskRetryPolicy\x12\x18\n\x0bmax_retries\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x1d\n\x10initial_delay_ms\x18\x02 \x01(\rH\x01\x88\x01\x01\x12\x19\n\x0cmax_delay_ms\x18\x03 \x01(\rH\x02\x88\x01\x01\x12\x1d\n\x10\x64\x65lay_multiplier\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x0e\n\x0c_max_retriesB\x13\n\x11_initial_delay_msB\x0f\n\r_max_delay_msB\x13\n\x11_delay_multiplier"\xa4\x05\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n \x01(\rH\x08\x88\x01\x01\x12\x30\n\x05input\x18\x0b \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x38\n\rreducer_input\x18\x0c \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12&\n\x19output_payload_uri_prefix\x18\r 
\x01(\tH\x0b\x88\x01\x01\x12;\n\x0cretry_policy\x18\x0e \x01(\x0b\x32 .executor_api_pb.TaskRetryPolicyH\x0c\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_msB\x08\n\x06_inputB\x10\n\x0e_reducer_inputB\x1c\n\x1a_output_payload_uri_prefixB\x0f\n\r_retry_policy"\x7f\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12(\n\x04task\x18\x02 \x01(\x0b\x32\x15.executor_api_pb.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xb9\x01\n\x14\x44\x65siredExecutorState\x12H\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32,.executor_api_pb.FunctionExecutorDescription\x12\x39\n\x10task_allocations\x18\x02 \x03(\x0b\x32\x1f.executor_api_pb.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock"\x87\x06\n\x18ReportTaskOutcomeRequest\x12\x14\n\x07task_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x04 \x01(\tH\x03\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x32\n\x07outcome\x18\x07 \x01(\x0e\x32\x1c.executor_api_pb.TaskOutcomeH\x05\x88\x01\x01\x12\x1a\n\rinvocation_id\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x18\n\x0b\x65xecutor_id\x18\t \x01(\tH\x07\x88\x01\x01\x12\x14\n\x07reducer\x18\n \x01(\x08H\x08\x88\x01\x01\x12\x16\n\x0enext_functions\x18\x0b \x03(\t\x12\x30\n\nfn_outputs\x18\x0c \x03(\x0b\x32\x1c.executor_api_pb.DataPayload\x12\x31\n\x06stdout\x18\x0e \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x31\n\x06stderr\x18\x0f \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12=\n\x0foutput_encoding\x18\r 
\x01(\x0e\x32\x1f.executor_api_pb.OutputEncodingH\x0b\x88\x01\x01\x12$\n\x17output_encoding_version\x18\x05 \x01(\x04H\x0c\x88\x01\x01\x42\n\n\x08_task_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\n\n\x08_outcomeB\x10\n\x0e_invocation_idB\x0e\n\x0c_executor_idB\n\n\x08_reducerB\t\n\x07_stdoutB\t\n\x07_stderrB\x12\n\x10_output_encodingB\x1a\n\x18_output_encoding_version"\x1b\n\x19ReportTaskOutcomeResponse*\xab\x01\n\x13\x44\x61taPayloadEncoding\x12!\n\x1d\x44\x41TA_PAYLOAD_ENCODING_UNKNOWN\x10\x00\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_JSON\x10\x01\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_TEXT\x10\x02\x12\'\n#DATA_PAYLOAD_ENCODING_BINARY_PICKLE\x10\x03*\xd6\x01\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_40GB\x10\x01\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_80GB\x10\x02\x12\x1e\n\x1aGPU_MODEL_NVIDIA_H100_80GB\x10\x03\x12\x1d\n\x19GPU_MODEL_NVIDIA_TESLA_T4\x10\x04\x12\x1a\n\x16GPU_MODEL_NVIDIA_A6000\x10\x05\x12\x18\n\x14GPU_MODEL_NVIDIA_A10\x10\x06*\xca\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08\x12%\n!FUNCTION_EXECUTOR_STATUS_SHUTDOWN\x10\t*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05*d\n\x0e\x45xecutorFlavor\x12\x1b\n\x17\x45XECUTOR_FLAVOR_UNKNOWN\x10\x00\x12\x17\n\x13\x45XECUTOR_FLAVOR_OSS\x10\x01\x12\x1c\n\x18\x45XECUTOR_FLAVOR_PLATFORM\x10\x02*[\n\x0bTaskOutcome\x12\x18\n\x14TASK_OUTCOME_UNKNOWN\x10\x00\x12\x18\n\x14TASK_OUTCOME_SUCCESS\x10\x01\x12\x18\n\x14TASK_OUTCOME_FAILURE\x10\x02*\x7f\n\x0eOutputEncoding\x12\x1b\n\x17OUTPUT_ENCODING_UNKNOWN\x10\x00\x12\x18\n\x14OUTPUT_ENCODING_JSON\x10\x01\x12\x1a\n\x16OUTPUT_ENCODING_PICKLE\x10\x02\x12\x1a\n\x16OUTPUT_ENCODING_BINARY\x10\x03\x32\xef\x02\n\x0b\x45xecutorAPI\x12t\n\x15report_executor_state\x12+.executor_api_pb.ReportExecutorStateRequest\x1a,.executor_api_pb.ReportExecutorStateResponse"\x00\x12z\n\x1bget_desired_executor_states\x12\x30.executor_api_pb.GetDesiredExecutorStatesRequest\x1a%.executor_api_pb.DesiredExecutorState"\x00\x30\x01\x12n\n\x13report_task_outcome\x12).executor_api_pb.ReportTaskOutcomeRequest\x1a*.executor_api_pb.ReportTaskOutcomeResponse"\x00\x62\x06proto3'
|
23
23
|
)
|
24
24
|
|
25
25
|
_globals = globals()
|
@@ -34,17 +34,17 @@ if not _descriptor._USE_C_DESCRIPTORS:
|
|
34
34
|
_globals["_DATAPAYLOADENCODING"]._serialized_start = 4857
|
35
35
|
_globals["_DATAPAYLOADENCODING"]._serialized_end = 5028
|
36
36
|
_globals["_GPUMODEL"]._serialized_start = 5031
|
37
|
-
_globals["_GPUMODEL"]._serialized_end =
|
38
|
-
_globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start =
|
39
|
-
_globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end =
|
40
|
-
_globals["_EXECUTORSTATUS"]._serialized_start =
|
41
|
-
_globals["_EXECUTORSTATUS"]._serialized_end =
|
42
|
-
_globals["_EXECUTORFLAVOR"]._serialized_start =
|
43
|
-
_globals["_EXECUTORFLAVOR"]._serialized_end =
|
44
|
-
_globals["_TASKOUTCOME"]._serialized_start =
|
45
|
-
_globals["_TASKOUTCOME"]._serialized_end =
|
46
|
-
_globals["_OUTPUTENCODING"]._serialized_start =
|
47
|
-
_globals["_OUTPUTENCODING"]._serialized_end =
|
37
|
+
_globals["_GPUMODEL"]._serialized_end = 5245
|
38
|
+
_globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start = 5248
|
39
|
+
_globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end = 5706
|
40
|
+
_globals["_EXECUTORSTATUS"]._serialized_start = 5709
|
41
|
+
_globals["_EXECUTORSTATUS"]._serialized_end = 5904
|
42
|
+
_globals["_EXECUTORFLAVOR"]._serialized_start = 5906
|
43
|
+
_globals["_EXECUTORFLAVOR"]._serialized_end = 6006
|
44
|
+
_globals["_TASKOUTCOME"]._serialized_start = 6008
|
45
|
+
_globals["_TASKOUTCOME"]._serialized_end = 6099
|
46
|
+
_globals["_OUTPUTENCODING"]._serialized_start = 6101
|
47
|
+
_globals["_OUTPUTENCODING"]._serialized_end = 6228
|
48
48
|
_globals["_DATAPAYLOAD"]._serialized_start = 55
|
49
49
|
_globals["_DATAPAYLOAD"]._serialized_end = 318
|
50
50
|
_globals["_GPURESOURCES"]._serialized_start = 320
|
@@ -81,6 +81,6 @@ if not _descriptor._USE_C_DESCRIPTORS:
|
|
81
81
|
_globals["_REPORTTASKOUTCOMEREQUEST"]._serialized_end = 4825
|
82
82
|
_globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_start = 4827
|
83
83
|
_globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_end = 4854
|
84
|
-
_globals["_EXECUTORAPI"]._serialized_start =
|
85
|
-
_globals["_EXECUTORAPI"]._serialized_end =
|
84
|
+
_globals["_EXECUTORAPI"]._serialized_start = 6231
|
85
|
+
_globals["_EXECUTORAPI"]._serialized_end = 6598
|
86
86
|
# @@protoc_insertion_point(module_scope)
|
@@ -25,6 +25,8 @@ class GPUModel(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
|
25
25
|
GPU_MODEL_NVIDIA_A100_80GB: _ClassVar[GPUModel]
|
26
26
|
GPU_MODEL_NVIDIA_H100_80GB: _ClassVar[GPUModel]
|
27
27
|
GPU_MODEL_NVIDIA_TESLA_T4: _ClassVar[GPUModel]
|
28
|
+
GPU_MODEL_NVIDIA_A6000: _ClassVar[GPUModel]
|
29
|
+
GPU_MODEL_NVIDIA_A10: _ClassVar[GPUModel]
|
28
30
|
|
29
31
|
class FunctionExecutorStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
30
32
|
__slots__ = ()
|
@@ -80,6 +82,8 @@ GPU_MODEL_NVIDIA_A100_40GB: GPUModel
|
|
80
82
|
GPU_MODEL_NVIDIA_A100_80GB: GPUModel
|
81
83
|
GPU_MODEL_NVIDIA_H100_80GB: GPUModel
|
82
84
|
GPU_MODEL_NVIDIA_TESLA_T4: GPUModel
|
85
|
+
GPU_MODEL_NVIDIA_A6000: GPUModel
|
86
|
+
GPU_MODEL_NVIDIA_A10: GPUModel
|
83
87
|
FUNCTION_EXECUTOR_STATUS_UNKNOWN: FunctionExecutorStatus
|
84
88
|
FUNCTION_EXECUTOR_STATUS_STARTING_UP: FunctionExecutorStatus
|
85
89
|
FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatus
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/function_executor.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/health_checker.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/function_executor/single_task_runner.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/grpc/function_executor_controller.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/health_check_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/prometheus_metrics_handler.py
RENAMED
File without changes
|
File without changes
|
{indexify-0.3.28 → indexify-0.3.30}/src/indexify/executor/monitoring/startup_probe_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|