indexify 0.3.17__tar.gz → 0.3.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.17 → indexify-0.3.18}/PKG-INFO +1 -1
- {indexify-0.3.17 → indexify-0.3.18}/pyproject.toml +1 -1
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/cli/cli.py +19 -2
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/executor.py +24 -9
- indexify-0.3.18/src/indexify/executor/executor_flavor.py +7 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/health_checker.py +20 -2
- indexify-0.3.18/src/indexify/executor/grpc/channel_manager.py +160 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/state_reconciler.py +14 -9
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/state_reporter.py +72 -14
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_fetcher.py +8 -3
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_reporter.py +17 -0
- indexify-0.3.17/src/indexify/proto/task_scheduler.proto → indexify-0.3.18/src/indexify/proto/executor_api.proto +23 -6
- indexify-0.3.18/src/indexify/proto/executor_api_pb2.py +70 -0
- indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.pyi → indexify-0.3.18/src/indexify/proto/executor_api_pb2.pyi +44 -4
- indexify-0.3.17/src/indexify/proto/task_scheduler_pb2_grpc.py → indexify-0.3.18/src/indexify/proto/executor_api_pb2_grpc.py +36 -26
- indexify-0.3.17/src/indexify/executor/grpc/channel_creator.py +0 -53
- indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.py +0 -64
- {indexify-0.3.17 → indexify-0.3.18}/README.md +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/api_objects.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/downloader.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_state.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_states_container.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/task_output.py +0 -0
- /indexify-0.3.17/src/indexify/executor/grpc/metrics/channel_creator.py → /indexify-0.3.18/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/downloader.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/executor.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_fetcher.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_reporter.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_runner.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/handler.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/metrics.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/server.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/runtime_probes.py +0 -0
- {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_runner.py +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this package are updated.
|
4
|
-
version = "0.3.17"
|
4
|
+
version = "0.3.18"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -13,7 +13,7 @@ import sys
|
|
13
13
|
from importlib.metadata import version
|
14
14
|
from pathlib import Path
|
15
15
|
from socket import gethostname
|
16
|
-
from typing import Annotated, List, Optional, Tuple
|
16
|
+
from typing import Annotated, Dict, List, Optional, Tuple
|
17
17
|
|
18
18
|
import nanoid
|
19
19
|
import prometheus_client
|
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
|
|
26
26
|
|
27
27
|
from indexify.executor.api_objects import FunctionURI
|
28
28
|
from indexify.executor.executor import Executor
|
29
|
+
from indexify.executor.executor_flavor import ExecutorFlavor
|
29
30
|
from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
|
30
31
|
SubprocessFunctionExecutorServerFactory,
|
31
32
|
)
|
@@ -119,7 +120,6 @@ def executor(
|
|
119
120
|
help="Port where to run Executor Monitoring server",
|
120
121
|
),
|
121
122
|
] = 7000,
|
122
|
-
# TODO: Figure out mTLS for gRPC.
|
123
123
|
grpc_server_addr: Annotated[
|
124
124
|
Optional[str],
|
125
125
|
typer.Option(
|
@@ -140,6 +140,15 @@ def executor(
|
|
140
140
|
),
|
141
141
|
),
|
142
142
|
] = False,
|
143
|
+
labels: Annotated[
|
144
|
+
List[str],
|
145
|
+
typer.Option(
|
146
|
+
"--label",
|
147
|
+
"-l",
|
148
|
+
help="Executor key-value label to be sent to the Server. "
|
149
|
+
"Specified as <key>=<value>",
|
150
|
+
),
|
151
|
+
] = [],
|
143
152
|
):
|
144
153
|
if dev:
|
145
154
|
configure_development_mode_logging()
|
@@ -162,6 +171,11 @@ def executor(
|
|
162
171
|
"--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
|
163
172
|
)
|
164
173
|
|
174
|
+
kv_labels: Dict[str, str] = {}
|
175
|
+
for label in labels:
|
176
|
+
key, value = label.split("=")
|
177
|
+
kv_labels[key] = value
|
178
|
+
|
165
179
|
executor_version = version("indexify")
|
166
180
|
logger = structlog.get_logger(module=__name__, executor_id=executor_id)
|
167
181
|
|
@@ -171,6 +185,7 @@ def executor(
|
|
171
185
|
server_addr=server_addr,
|
172
186
|
config_path=config_path,
|
173
187
|
executor_version=executor_version,
|
188
|
+
labels=kv_labels,
|
174
189
|
executor_cache=executor_cache,
|
175
190
|
ports=ports,
|
176
191
|
functions=function_uris,
|
@@ -205,7 +220,9 @@ def executor(
|
|
205
220
|
Executor(
|
206
221
|
id=executor_id,
|
207
222
|
development_mode=dev,
|
223
|
+
flavor=ExecutorFlavor.OSS,
|
208
224
|
version=executor_version,
|
225
|
+
labels=kv_labels,
|
209
226
|
health_checker=GenericHealthChecker(),
|
210
227
|
code_path=executor_cache,
|
211
228
|
function_allowlist=_parse_function_uris(function_uris),
|
@@ -9,17 +9,18 @@ import structlog
|
|
9
9
|
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
10
10
|
from tensorlake.utils.logging import suppress as suppress_logging
|
11
11
|
|
12
|
-
from indexify.proto.
|
12
|
+
from indexify.proto.executor_api_pb2 import ExecutorStatus
|
13
13
|
|
14
14
|
from .api_objects import FunctionURI, Task
|
15
15
|
from .downloader import Downloader
|
16
|
+
from .executor_flavor import ExecutorFlavor
|
16
17
|
from .function_executor.function_executor_states_container import (
|
17
18
|
FunctionExecutorStatesContainer,
|
18
19
|
)
|
19
20
|
from .function_executor.server.function_executor_server_factory import (
|
20
21
|
FunctionExecutorServerFactory,
|
21
22
|
)
|
22
|
-
from .grpc.
|
23
|
+
from .grpc.channel_manager import ChannelManager
|
23
24
|
from .grpc.state_reconciler import ExecutorStateReconciler
|
24
25
|
from .grpc.state_reporter import ExecutorStateReporter
|
25
26
|
from .metrics.executor import (
|
@@ -55,7 +56,9 @@ class Executor:
|
|
55
56
|
self,
|
56
57
|
id: str,
|
57
58
|
development_mode: bool,
|
59
|
+
flavor: ExecutorFlavor,
|
58
60
|
version: str,
|
61
|
+
labels: Dict[str, str],
|
59
62
|
code_path: Path,
|
60
63
|
health_checker: HealthChecker,
|
61
64
|
function_allowlist: Optional[List[FunctionURI]],
|
@@ -106,18 +109,25 @@ class Executor:
|
|
106
109
|
self._task_runner: Optional[TaskRunner] = None
|
107
110
|
self._task_fetcher: Optional[TaskFetcher] = None
|
108
111
|
# gRPC mode services
|
109
|
-
self.
|
112
|
+
self._channel_manager: Optional[ChannelManager] = None
|
110
113
|
self._state_reporter: Optional[ExecutorStateReporter] = None
|
111
114
|
self._state_reconciler: Optional[ExecutorStateReconciler] = None
|
112
115
|
|
113
116
|
if grpc_server_addr is not None:
|
114
|
-
self.
|
117
|
+
self._channel_manager = ChannelManager(
|
118
|
+
server_address=grpc_server_addr,
|
119
|
+
config_path=config_path,
|
120
|
+
logger=self._logger,
|
121
|
+
)
|
115
122
|
self._state_reporter = ExecutorStateReporter(
|
116
123
|
executor_id=id,
|
124
|
+
flavor=flavor,
|
125
|
+
version=version,
|
126
|
+
labels=labels,
|
117
127
|
development_mode=development_mode,
|
118
128
|
function_allowlist=self._function_allowlist,
|
119
129
|
function_executor_states=self._function_executor_states,
|
120
|
-
|
130
|
+
channel_manager=self._channel_manager,
|
121
131
|
logger=self._logger,
|
122
132
|
)
|
123
133
|
self._state_reporter.update_executor_status(
|
@@ -133,7 +143,8 @@ class Executor:
|
|
133
143
|
config_path=config_path,
|
134
144
|
downloader=self._downloader,
|
135
145
|
task_reporter=self._task_reporter,
|
136
|
-
|
146
|
+
channel_manager=self._channel_manager,
|
147
|
+
state_reporter=self._state_reporter,
|
137
148
|
logger=self._logger,
|
138
149
|
)
|
139
150
|
else:
|
@@ -147,6 +158,7 @@ class Executor:
|
|
147
158
|
self._task_fetcher = TaskFetcher(
|
148
159
|
executor_id=id,
|
149
160
|
executor_version=version,
|
161
|
+
labels=labels,
|
150
162
|
function_allowlist=function_allowlist,
|
151
163
|
protocol=protocol,
|
152
164
|
indexify_server_addr=self._server_addr,
|
@@ -326,7 +338,9 @@ class Executor:
|
|
326
338
|
).inc()
|
327
339
|
|
328
340
|
async def _shutdown(self, loop):
|
329
|
-
self._logger.info(
|
341
|
+
self._logger.info(
|
342
|
+
"shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
|
343
|
+
)
|
330
344
|
if self._state_reporter is not None:
|
331
345
|
self._state_reporter.update_executor_status(
|
332
346
|
ExecutorStatus.EXECUTOR_STATUS_STOPPING
|
@@ -339,12 +353,13 @@ class Executor:
|
|
339
353
|
|
340
354
|
self._is_shutdown = True
|
341
355
|
await self._monitoring_server.shutdown()
|
356
|
+
await self._task_reporter.shutdown()
|
342
357
|
|
343
358
|
if self._task_runner is not None:
|
344
359
|
await self._task_runner.shutdown()
|
345
360
|
|
346
|
-
if self.
|
347
|
-
await self.
|
361
|
+
if self._channel_manager is not None:
|
362
|
+
await self._channel_manager.shutdown()
|
348
363
|
if self._state_reporter is not None:
|
349
364
|
await self._state_reporter.shutdown()
|
350
365
|
if self._state_reconciler is not None:
|
{indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/health_checker.py
RENAMED
@@ -70,8 +70,10 @@ class HealthChecker:
|
|
70
70
|
# code is not involved when TCP connections are established to FE. Problems reestablishing
|
71
71
|
# the TCP connection are usually due to the FE process crashing and its gRPC server socket
|
72
72
|
# not being available anymore or due to prolonged local networking failures on Executor.
|
73
|
-
|
74
|
-
|
73
|
+
if (
|
74
|
+
_channel_state(self._channel, self._logger)
|
75
|
+
== grpc.ChannelConnectivity.TRANSIENT_FAILURE
|
76
|
+
):
|
75
77
|
return HealthCheckResult(
|
76
78
|
is_healthy=False,
|
77
79
|
reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
|
@@ -126,3 +128,19 @@ class HealthChecker:
|
|
126
128
|
|
127
129
|
asyncio.create_task(self._health_check_failed_callback(result))
|
128
130
|
self._health_check_loop_task = None
|
131
|
+
|
132
|
+
|
133
|
+
def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
|
134
|
+
"""Gets channel connectivity state and suppresses all exceptions.
|
135
|
+
|
136
|
+
Suppressing the exceptions is important because the channel connectivity state is an experimental
|
137
|
+
feature. On error falls back to READY state which assumes that the channel is okay.
|
138
|
+
"""
|
139
|
+
try:
|
140
|
+
return channel.get_state()
|
141
|
+
except Exception as e:
|
142
|
+
logger.error(
|
143
|
+
"Failed getting channel state, falling back to default READY state",
|
144
|
+
exc_info=e,
|
145
|
+
)
|
146
|
+
return grpc.ChannelConnectivity.READY
|
@@ -0,0 +1,160 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, Dict, Optional
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
import yaml
|
6
|
+
|
7
|
+
from .metrics.channel_manager import (
|
8
|
+
metric_grpc_server_channel_creation_latency,
|
9
|
+
metric_grpc_server_channel_creation_retries,
|
10
|
+
metric_grpc_server_channel_creations,
|
11
|
+
)
|
12
|
+
|
13
|
+
_RETRY_INTERVAL_SEC = 5
|
14
|
+
_CONNECT_TIMEOUT_SEC = 5
|
15
|
+
|
16
|
+
|
17
|
+
class ChannelManager:
|
18
|
+
def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
|
19
|
+
self._logger: Any = logger.bind(module=__name__, server_address=server_address)
|
20
|
+
self._server_address: str = server_address
|
21
|
+
self._channel_credentials: Optional[grpc.ChannelCredentials] = None
|
22
|
+
# This lock protects the fields below.
|
23
|
+
self._lock = asyncio.Lock()
|
24
|
+
self._channel: Optional[grpc.aio.Channel] = None
|
25
|
+
|
26
|
+
self._init_tls(config_path)
|
27
|
+
|
28
|
+
def _init_tls(self, config_path: Optional[str]):
|
29
|
+
if config_path is None:
|
30
|
+
return
|
31
|
+
|
32
|
+
# The same config file format as in Tensorlake SDK HTTP client, see:
|
33
|
+
# https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
|
34
|
+
with open(config_path, "r") as config_file:
|
35
|
+
config = yaml.safe_load(config_file)
|
36
|
+
|
37
|
+
if not config.get("use_tls", False):
|
38
|
+
return
|
39
|
+
|
40
|
+
tls_config: Dict[str, str] = config["tls_config"]
|
41
|
+
cert_path: Optional[str] = tls_config.get("cert_path", None)
|
42
|
+
key_path: Optional[str] = tls_config.get("key_path", None)
|
43
|
+
ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
|
44
|
+
|
45
|
+
self._logger = self._logger.bind(
|
46
|
+
cert_path=cert_path,
|
47
|
+
key_path=key_path,
|
48
|
+
ca_bundle_path=ca_bundle_path,
|
49
|
+
)
|
50
|
+
self._logger.info("TLS is enabled for grpc channels to server")
|
51
|
+
|
52
|
+
private_key: Optional[bytes] = None
|
53
|
+
certificate_chain: Optional[bytes] = None
|
54
|
+
root_certificates: Optional[bytes] = None
|
55
|
+
|
56
|
+
if cert_path is not None:
|
57
|
+
with open(cert_path, "rb") as cert_file:
|
58
|
+
certificate_chain = cert_file.read()
|
59
|
+
if key_path is not None:
|
60
|
+
with open(key_path, "rb") as key_file:
|
61
|
+
private_key = key_file.read()
|
62
|
+
if ca_bundle_path is not None:
|
63
|
+
with open(ca_bundle_path, "rb") as ca_bundle_file:
|
64
|
+
root_certificates = ca_bundle_file.read()
|
65
|
+
|
66
|
+
self._channel_credentials = grpc.ssl_channel_credentials(
|
67
|
+
root_certificates=root_certificates,
|
68
|
+
private_key=private_key,
|
69
|
+
certificate_chain=certificate_chain,
|
70
|
+
)
|
71
|
+
|
72
|
+
async def get_channel(self) -> grpc.aio.Channel:
|
73
|
+
"""Returns a channel to the gRPC server.
|
74
|
+
|
75
|
+
Returns a ready to use channel. Blocks until the channel is ready,
|
76
|
+
never raises any exceptions.
|
77
|
+
If previously returned channel is healthy then returns it again.
|
78
|
+
Otherwise, returns a new channel but closes the previously returned one.
|
79
|
+
"""
|
80
|
+
# Use the lock to ensure that we only create one channel without race conditions.
|
81
|
+
async with self._lock:
|
82
|
+
if self._channel is None:
|
83
|
+
self._channel = await self._create_channel()
|
84
|
+
elif not await self._locked_channel_is_healthy():
|
85
|
+
self._logger.info("grpc channel to server is unhealthy")
|
86
|
+
await self._destroy_locked_channel()
|
87
|
+
self._channel = await self._create_channel()
|
88
|
+
|
89
|
+
return self._channel
|
90
|
+
|
91
|
+
async def _create_channel(self) -> grpc.aio.Channel:
|
92
|
+
"""Creates a new channel to the gRPC server.
|
93
|
+
|
94
|
+
Returns a ready to use channel. Blocks until the channel
|
95
|
+
is ready, never raises any exceptions.
|
96
|
+
"""
|
97
|
+
self._logger.info("creating new grpc server channel")
|
98
|
+
|
99
|
+
with metric_grpc_server_channel_creation_latency.time():
|
100
|
+
metric_grpc_server_channel_creations.inc()
|
101
|
+
while True:
|
102
|
+
try:
|
103
|
+
if self._channel_credentials is None:
|
104
|
+
channel = grpc.aio.insecure_channel(target=self._server_address)
|
105
|
+
else:
|
106
|
+
channel = grpc.aio.secure_channel(
|
107
|
+
target=self._server_address,
|
108
|
+
credentials=self._channel_credentials,
|
109
|
+
)
|
110
|
+
|
111
|
+
await asyncio.wait_for(
|
112
|
+
channel.channel_ready(),
|
113
|
+
timeout=_CONNECT_TIMEOUT_SEC,
|
114
|
+
)
|
115
|
+
return channel
|
116
|
+
except Exception:
|
117
|
+
self._logger.error(
|
118
|
+
f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
|
119
|
+
)
|
120
|
+
try:
|
121
|
+
await channel.close()
|
122
|
+
except Exception as e:
|
123
|
+
self._logger.error(
|
124
|
+
"failed closing not established channel", exc_info=e
|
125
|
+
)
|
126
|
+
|
127
|
+
metric_grpc_server_channel_creation_retries.inc()
|
128
|
+
await asyncio.sleep(_RETRY_INTERVAL_SEC)
|
129
|
+
|
130
|
+
async def _locked_channel_is_healthy(self) -> bool:
|
131
|
+
"""Checks if the channel is healthy.
|
132
|
+
|
133
|
+
Returns True if the channel is healthy, False otherwise.
|
134
|
+
self._lock must be acquired before calling this method.
|
135
|
+
Never raises any exceptions.
|
136
|
+
"""
|
137
|
+
try:
|
138
|
+
return self._channel.get_state() == grpc.ChannelConnectivity.READY
|
139
|
+
except Exception as e:
|
140
|
+
# Assume that the channel is healthy because get_state() method is marked as experimental
|
141
|
+
# so we can't fully trust it.
|
142
|
+
self._logger.error(
|
143
|
+
"failed getting channel state, assuming channel is healthy", exc_info=e
|
144
|
+
)
|
145
|
+
return True
|
146
|
+
|
147
|
+
async def _destroy_locked_channel(self):
|
148
|
+
"""Closes the existing channel.
|
149
|
+
|
150
|
+
self._lock must be acquired before calling this method.
|
151
|
+
Never raises any exceptions.
|
152
|
+
"""
|
153
|
+
try:
|
154
|
+
await self._channel.close()
|
155
|
+
except Exception as e:
|
156
|
+
self._logger.error("failed closing channel", exc_info=e)
|
157
|
+
self._channel = None
|
158
|
+
|
159
|
+
async def shutdown(self):
|
160
|
+
pass
|
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
SerializedObject,
|
8
8
|
)
|
9
9
|
|
10
|
-
from indexify.proto.
|
10
|
+
from indexify.proto.executor_api_pb2 import (
|
11
11
|
DesiredExecutorState,
|
12
12
|
FunctionExecutorDescription,
|
13
13
|
FunctionExecutorStatus,
|
14
14
|
GetDesiredExecutorStatesRequest,
|
15
15
|
)
|
16
|
-
from indexify.proto.
|
17
|
-
|
16
|
+
from indexify.proto.executor_api_pb2_grpc import (
|
17
|
+
ExecutorAPIStub,
|
18
18
|
)
|
19
19
|
|
20
20
|
from ..downloader import Downloader
|
@@ -43,7 +43,8 @@ from ..metrics.executor import (
|
|
43
43
|
metric_tasks_reporting_outcome,
|
44
44
|
)
|
45
45
|
from ..task_reporter import TaskReporter
|
46
|
-
from .
|
46
|
+
from .channel_manager import ChannelManager
|
47
|
+
from .state_reporter import ExecutorStateReporter
|
47
48
|
|
48
49
|
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
49
50
|
|
@@ -58,7 +59,8 @@ class ExecutorStateReconciler:
|
|
58
59
|
config_path: Optional[str],
|
59
60
|
downloader: Downloader,
|
60
61
|
task_reporter: TaskReporter,
|
61
|
-
|
62
|
+
channel_manager: ChannelManager,
|
63
|
+
state_reporter: ExecutorStateReporter,
|
62
64
|
logger: Any,
|
63
65
|
):
|
64
66
|
self._executor_id: str = executor_id
|
@@ -72,7 +74,8 @@ class ExecutorStateReconciler:
|
|
72
74
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
73
75
|
function_executor_states
|
74
76
|
)
|
75
|
-
self.
|
77
|
+
self._channel_manager: ChannelManager = channel_manager
|
78
|
+
self._state_reporter: ExecutorStateReporter = state_reporter
|
76
79
|
self._logger: Any = logger.bind(module=__name__)
|
77
80
|
self._is_shutdown: bool = False
|
78
81
|
self._server_last_clock: Optional[int] = None
|
@@ -83,12 +86,14 @@ class ExecutorStateReconciler:
|
|
83
86
|
Never raises any exceptions.
|
84
87
|
"""
|
85
88
|
while not self._is_shutdown:
|
86
|
-
async with await self.
|
89
|
+
async with await self._channel_manager.get_channel() as server_channel:
|
87
90
|
server_channel: grpc.aio.Channel
|
88
|
-
stub =
|
91
|
+
stub = ExecutorAPIStub(server_channel)
|
89
92
|
while not self._is_shutdown:
|
90
93
|
try:
|
91
|
-
#
|
94
|
+
# Report state once before starting the stream so Server
|
95
|
+
# doesn't use old state it knew about this Executor in the past.
|
96
|
+
await self._state_reporter.report_state(stub)
|
92
97
|
desired_states_stream: AsyncGenerator[
|
93
98
|
DesiredExecutorState, None
|
94
99
|
] = stub.get_desired_executor_states(
|
@@ -1,37 +1,44 @@
|
|
1
1
|
import asyncio
|
2
|
+
import hashlib
|
3
|
+
from socket import gethostname
|
2
4
|
from typing import Any, Dict, List, Optional
|
3
5
|
|
4
6
|
import grpc
|
5
7
|
|
6
|
-
from indexify.proto.
|
8
|
+
from indexify.proto.executor_api_pb2 import (
|
7
9
|
AllowedFunction,
|
10
|
+
)
|
11
|
+
from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
|
12
|
+
from indexify.proto.executor_api_pb2 import (
|
8
13
|
ExecutorState,
|
9
14
|
ExecutorStatus,
|
10
15
|
FunctionExecutorDescription,
|
11
16
|
)
|
12
|
-
from indexify.proto.
|
17
|
+
from indexify.proto.executor_api_pb2 import (
|
13
18
|
FunctionExecutorState as FunctionExecutorStateProto,
|
14
19
|
)
|
15
|
-
from indexify.proto.
|
20
|
+
from indexify.proto.executor_api_pb2 import (
|
16
21
|
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
17
22
|
)
|
18
|
-
from indexify.proto.
|
23
|
+
from indexify.proto.executor_api_pb2 import (
|
19
24
|
GPUModel,
|
20
25
|
GPUResources,
|
21
26
|
HostResources,
|
22
27
|
ReportExecutorStateRequest,
|
23
28
|
)
|
24
|
-
from indexify.proto.
|
25
|
-
|
29
|
+
from indexify.proto.executor_api_pb2_grpc import (
|
30
|
+
ExecutorAPIStub,
|
26
31
|
)
|
27
32
|
|
28
33
|
from ..api_objects import FunctionURI
|
34
|
+
from ..executor_flavor import ExecutorFlavor
|
29
35
|
from ..function_executor.function_executor_state import FunctionExecutorState
|
30
36
|
from ..function_executor.function_executor_states_container import (
|
31
37
|
FunctionExecutorStatesContainer,
|
32
38
|
)
|
33
39
|
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
34
|
-
from
|
40
|
+
from ..runtime_probes import RuntimeProbes
|
41
|
+
from .channel_manager import ChannelManager
|
35
42
|
from .metrics.state_reporter import (
|
36
43
|
metric_state_report_errors,
|
37
44
|
metric_state_report_latency,
|
@@ -47,24 +54,32 @@ class ExecutorStateReporter:
|
|
47
54
|
def __init__(
|
48
55
|
self,
|
49
56
|
executor_id: str,
|
57
|
+
flavor: ExecutorFlavor,
|
58
|
+
version: str,
|
59
|
+
labels: Dict[str, str],
|
50
60
|
development_mode: bool,
|
51
61
|
function_allowlist: Optional[List[FunctionURI]],
|
52
62
|
function_executor_states: FunctionExecutorStatesContainer,
|
53
|
-
|
63
|
+
channel_manager: ChannelManager,
|
54
64
|
logger: Any,
|
55
65
|
):
|
56
66
|
self._executor_id: str = executor_id
|
67
|
+
self._flavor: ExecutorFlavor = flavor
|
68
|
+
self._version: str = version
|
69
|
+
self._labels: Dict[str, str] = labels.copy()
|
57
70
|
self._development_mode: bool = development_mode
|
71
|
+
self._hostname: str = gethostname()
|
58
72
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
59
73
|
function_executor_states
|
60
74
|
)
|
61
|
-
self.
|
75
|
+
self._channel_manager = channel_manager
|
62
76
|
self._logger: Any = logger.bind(module=__name__)
|
63
77
|
self._is_shutdown: bool = False
|
64
78
|
self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
|
65
79
|
self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
|
66
80
|
function_allowlist
|
67
81
|
)
|
82
|
+
self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
|
68
83
|
|
69
84
|
def update_executor_status(self, value: ExecutorStatus):
|
70
85
|
self._executor_status = value
|
@@ -75,12 +90,16 @@ class ExecutorStateReporter:
|
|
75
90
|
Never raises any exceptions.
|
76
91
|
"""
|
77
92
|
while not self._is_shutdown:
|
78
|
-
async with await self.
|
93
|
+
async with await self._channel_manager.get_channel() as server_channel:
|
79
94
|
server_channel: grpc.aio.Channel
|
80
|
-
stub =
|
95
|
+
stub = ExecutorAPIStub(server_channel)
|
81
96
|
while not self._is_shutdown:
|
82
97
|
try:
|
83
|
-
|
98
|
+
# The periodic state reports serve as channel health monitoring requests
|
99
|
+
# (same as TCP keep-alive). Channel Manager returns the same healthy channel
|
100
|
+
# for all RPCs that we do from Executor to Server. So all the RPCs benefit
|
101
|
+
# from this channel health monitoring.
|
102
|
+
await self.report_state(stub)
|
84
103
|
await asyncio.sleep(_REPORTING_INTERVAL_SEC)
|
85
104
|
except Exception as e:
|
86
105
|
self._logger.error(
|
@@ -92,7 +111,11 @@ class ExecutorStateReporter:
|
|
92
111
|
|
93
112
|
self._logger.info("State reporter shutdown")
|
94
113
|
|
95
|
-
async def
|
114
|
+
async def report_state(self, stub: ExecutorAPIStub):
|
115
|
+
"""Reports the current state to the server represented by the supplied stub.
|
116
|
+
|
117
|
+
Raises exceptions on failure.
|
118
|
+
"""
|
96
119
|
with (
|
97
120
|
metric_state_report_errors.count_exceptions(),
|
98
121
|
metric_state_report_latency.time(),
|
@@ -101,11 +124,16 @@ class ExecutorStateReporter:
|
|
101
124
|
state = ExecutorState(
|
102
125
|
executor_id=self._executor_id,
|
103
126
|
development_mode=self._development_mode,
|
104
|
-
|
127
|
+
hostname=self._hostname,
|
128
|
+
flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
|
129
|
+
version=self._version,
|
130
|
+
status=self._executor_status,
|
105
131
|
free_resources=await self._fetch_free_host_resources(),
|
106
132
|
allowed_functions=self._allowed_functions,
|
107
133
|
function_executor_states=await self._fetch_function_executor_states(),
|
134
|
+
labels=self._labels,
|
108
135
|
)
|
136
|
+
state.state_hash = _state_hash(state)
|
109
137
|
|
110
138
|
await stub.report_executor_state(
|
111
139
|
ReportExecutorStateRequest(executor_state=state),
|
@@ -197,3 +225,33 @@ def _to_grpc_function_executor_status(
|
|
197
225
|
logger.error("Unexpected Function Executor status", status=status)
|
198
226
|
|
199
227
|
return result
|
228
|
+
|
229
|
+
|
230
|
+
_FLAVOR_MAPPING = {
|
231
|
+
ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
|
232
|
+
ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
|
233
|
+
}
|
234
|
+
|
235
|
+
|
236
|
+
def _to_grpc_executor_flavor(
|
237
|
+
flavor: ExecutorFlavor, logger: Any
|
238
|
+
) -> ExecutorFlavorProto:
|
239
|
+
result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
|
240
|
+
flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
|
241
|
+
)
|
242
|
+
|
243
|
+
if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
|
244
|
+
logger.error("Unexpected Executor flavor", flavor=flavor)
|
245
|
+
|
246
|
+
return result
|
247
|
+
|
248
|
+
|
249
|
+
def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
|
250
|
+
return {k: str(v) for k, v in labels.items()}
|
251
|
+
|
252
|
+
|
253
|
+
def _state_hash(state: ExecutorState) -> str:
|
254
|
+
serialized_state: bytes = state.SerializeToString(deterministic=True)
|
255
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
256
|
+
hasher.update(serialized_state)
|
257
|
+
return hasher.hexdigest()
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import time
|
3
|
-
from
|
3
|
+
from socket import gethostname
|
4
|
+
from typing import AsyncGenerator, Dict, List, Optional
|
4
5
|
|
5
6
|
import structlog
|
6
7
|
from httpx_sse import aconnect_sse
|
@@ -22,6 +23,7 @@ class TaskFetcher:
|
|
22
23
|
self,
|
23
24
|
executor_id: str,
|
24
25
|
executor_version: str,
|
26
|
+
labels: Dict[str, str],
|
25
27
|
function_allowlist: Optional[List[FunctionURI]],
|
26
28
|
protocol: str,
|
27
29
|
indexify_server_addr: str,
|
@@ -33,12 +35,15 @@ class TaskFetcher:
|
|
33
35
|
self._logger = structlog.get_logger(module=__name__)
|
34
36
|
|
35
37
|
probe_info: ProbeInfo = RuntimeProbes().probe()
|
38
|
+
all_labels = probe_info.labels.copy()
|
39
|
+
all_labels.update(labels)
|
40
|
+
|
36
41
|
self._executor_metadata: ExecutorMetadata = ExecutorMetadata(
|
37
42
|
id=executor_id,
|
38
43
|
executor_version=executor_version,
|
39
|
-
addr=
|
44
|
+
addr=gethostname(),
|
40
45
|
function_allowlist=function_allowlist,
|
41
|
-
labels=
|
46
|
+
labels=all_labels,
|
42
47
|
)
|
43
48
|
|
44
49
|
async def run(self) -> AsyncGenerator[Task, None]:
|