indexify 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/channel_manager.py +75 -103
- indexify/executor/executor.py +1 -1
- indexify/executor/metrics/state_reporter.py +2 -2
- indexify/executor/monitoring/health_checker/generic_health_checker.py +4 -0
- indexify/executor/monitoring/health_checker/metrics/health_checker.py +5 -0
- indexify/executor/state_reconciler.py +37 -9
- indexify/executor/state_reporter.py +66 -48
- indexify/proto/executor_api.proto +0 -2
- indexify/proto/executor_api_pb2_grpc.py +0 -2
- {indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/METADATA +3 -3
- {indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/RECORD +13 -12
- {indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/WHEEL +0 -0
- {indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/entry_points.txt +0 -0
indexify/executor/channel_manager.py CHANGED
@@ -10,10 +10,8 @@ from .metrics.channel_manager import (
     metric_grpc_server_channel_creation_retries,
     metric_grpc_server_channel_creations,
 )
-from .monitoring.health_checker.health_checker import HealthChecker

 _RETRY_INTERVAL_SEC = 5
-_CONNECT_TIMEOUT_SEC = 5


 class ChannelManager:
@@ -21,16 +19,14 @@ class ChannelManager:
         self,
         server_address: str,
         config_path: Optional[str],
-        health_checker: HealthChecker,
         logger: Any,
     ):
         self._logger: Any = logger.bind(module=__name__, server_address=server_address)
         self._server_address: str = server_address
-        self._health_checker: HealthChecker = health_checker
         self._channel_credentials: Optional[grpc.ChannelCredentials] = None
-        #
-        self.
-        self.
+        # Shared channel used by different Executor components to communicate with Server.
+        self._shared_channel_lock = asyncio.Lock()
+        self._shared_channel: Optional[grpc.aio.Channel] = None

         self._init_tls(config_path)

@@ -79,117 +75,93 @@ class ChannelManager:
         )

     async def destroy(self):
-
-
+        # Okay to not hold the lock here as we're destroying the server channel forever.
+        if self._shared_channel is not None:
+            await self._destroy_shared_channel()

-    async def
-        """
+    async def fail_shared_channel(self) -> None:
+        """Marks the shared channel as unhealthy and creates a new one.

-
-        never raises any exceptions.
-        If previously returned channel is healthy then returns it again.
-        Otherwise, returns a new channel but closes the previously returned one.
+        Doesn't raise any exceptions.
         """
-
-
-
-
-                self._channel = await self._create_ready_channel()
-            elif not await self._locked_channel_is_healthy():
-                self._logger.info("grpc channel to server is unhealthy")
-                self._health_checker.server_connection_state_changed(
-                    is_healthy=False,
-                    status_message="grpc channel to server is unhealthy",
-                )
-                await self._destroy_locked_channel()
-                self._channel = await self._create_ready_channel()
-                self._health_checker.server_connection_state_changed(
-                    is_healthy=True, status_message="grpc channel to server is healthy"
+        async with self._shared_channel_lock:
+            if self._shared_channel is None:
+                self._logger.error(
+                    "grpc server channel doesn't exist, can't mark it unhealthy"
                 )
+                return

-
+            self._logger.info("marking grpc server channel as unhealthy")
+            # All the channel users will see it failing cause we destroyed it and call get_channel() again.
+            await self._destroy_shared_channel()

-    def
-        """
+    async def get_shared_channel(self) -> grpc.aio.Channel:
+        """Returns shared channel to the gRPC server.

-        The channel is
+        The health of the shared channel is constantly monitored so it's more reliable than using a
+        standalone channel created for a particular short term need. Doesn't raise any exceptions.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-        is ready, never raises any exceptions.
+        # Use the lock to ensure that we only create one channel without race conditions.
+        async with self._shared_channel_lock:
+            if self._shared_channel is None:
+                await self._create_shared_channel()
+
+            return self._shared_channel
+
+    def create_standalone_channel(self) -> grpc.aio.Channel:
+        """Creates a new channel to the gRPC server.
+
+        Used for one-off RPCs where we don't need to monitor channel health or retry its creation indefinitely.
+        Raises an exception on failure.
         """
-        with
+        with (
+            metric_grpc_server_channel_creation_retries.count_exceptions(),
+            metric_grpc_server_channel_creation_latency.time(),
+        ):
             metric_grpc_server_channel_creations.inc()
-
-
-
-
-
-            self.
-
-
-
-
-
-
-                        channel.channel_ready(),
-                        timeout=_CONNECT_TIMEOUT_SEC,
-                    )
-                    self._logger.info(
-                        "grpc server channel is established (ready)",
-                        duration_sec=time.monotonic() - channel_ready_start,
-                    )
-
-                    return channel
-                except BaseException:
-                    self._logger.error(
-                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
-                    )
-                    try:
-                        await channel.close()
-                    except BaseException as e:
-                        self._logger.error(
-                            "failed closing not established channel", exc_info=e
-                        )
-
-                    metric_grpc_server_channel_creation_retries.inc()
-                    await asyncio.sleep(_RETRY_INTERVAL_SEC)
-
-    async def _locked_channel_is_healthy(self) -> bool:
-        """Checks if the channel is healthy.
-
-        Returns True if the channel is healthy, False otherwise.
-        self._lock must be acquired before calling this method.
+            if self._channel_credentials is None:
+                return grpc.aio.insecure_channel(target=self._server_address)
+            else:
+                return grpc.aio.secure_channel(
+                    target=self._server_address,
+                    credentials=self._channel_credentials,
+                )
+
+    async def _create_shared_channel(self) -> None:
+        """Creates new shared channel.
+
+        self._shared_channel_lock must be acquired before calling this method.
         Never raises any exceptions.
         """
-
-
-
-
-
-
-
-
-
+        while True:
+            try:
+                create_channel_start = time.monotonic()
+                self._logger.info("creating new grpc channel to server")
+                self._shared_channel = self.create_standalone_channel()
+                # Ensure the channel tried to connect to not get "channel closed errors" without actually trying to connect.
+                self._shared_channel.get_state(try_to_connect=True)
+                self._logger.info(
+                    "created new grpc channel to server",
+                    duration_sec=time.monotonic() - create_channel_start,
+                )
+                break
+            except Exception as e:
+                self._logger.error(
+                    f"failed creating grpc channel to server, retrying in {_RETRY_INTERVAL_SEC} seconds",
+                    exc_info=e,
+                )
+                await asyncio.sleep(_RETRY_INTERVAL_SEC)

-    async def
-        """Closes the existing channel.
+    async def _destroy_shared_channel(self) -> None:
+        """Closes the existing shared channel.

-        self.
+        self._shared_channel_lock must be acquired before calling this method.
         Never raises any exceptions.
         """
         try:
-
+            self._logger.info("closing grpc channel to server")
+            await self._shared_channel.close()
+            self._logger.info("closed grpc channel to server")
         except Exception as e:
-            self._logger.error("failed closing channel", exc_info=e)
-            self.
+            self._logger.error("failed closing grpc channel to server", exc_info=e)
+        self._shared_channel = None
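The channel_manager.py change replaces the health-checker-driven get_channel() flow with a lock-guarded shared channel plus two explicit entry points: get_shared_channel() for long-lived use, create_standalone_channel() for one-off RPCs, and fail_shared_channel() to drop the shared channel so the next caller recreates it. Below is a minimal sketch of that pattern using plain grpcio; SharedChannel, its method names, and the address are illustrative assumptions, not the indexify API.

import asyncio
from typing import Optional

import grpc


class SharedChannel:
    """Hypothetical helper mirroring the lock-guarded shared channel in the diff."""

    def __init__(self, server_address: str):
        self._server_address = server_address
        self._lock = asyncio.Lock()  # guards lazy creation of the single shared channel
        self._channel: Optional[grpc.aio.Channel] = None

    async def get(self) -> grpc.aio.Channel:
        # Create the channel once; every caller reuses the same object afterwards.
        async with self._lock:
            if self._channel is None:
                self._channel = grpc.aio.insecure_channel(target=self._server_address)
                # Ask the channel to start connecting instead of staying idle.
                self._channel.get_state(try_to_connect=True)
            return self._channel

    async def fail(self) -> None:
        # A caller that hit an RPC error drops the channel; the next get() recreates it.
        async with self._lock:
            if self._channel is not None:
                await self._channel.close()
                self._channel = None


async def main() -> None:
    shared = SharedChannel("localhost:50051")  # illustrative address, no live server needed
    first = await shared.get()
    print(first is await shared.get())  # True: the same channel object is shared
    await shared.fail()
    print(first is await shared.get())  # False: a fresh channel after fail()


asyncio.run(main())

Dropping the channel instead of probing its health pushes reconnection cost onto the next RPC, which is what the reworked state reporter below relies on.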
indexify/executor/executor.py CHANGED
@@ -69,7 +69,6 @@ class Executor:
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
-            health_checker=health_checker,
             logger=self._logger,
         )
         function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
@@ -80,6 +79,7 @@ class Executor:
             function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
+            health_checker=health_checker,
             logger=self._logger,
         )
         self._state_reporter.update_executor_status(
indexify/executor/metrics/state_reporter.py CHANGED
@@ -6,11 +6,11 @@ metric_state_report_rpcs = prometheus_client.Counter(
     "state_report_rpcs",
     "Number of Executor state report RPCs to Server",
 )
-
+metric_state_report_rpc_errors = prometheus_client.Counter(
     "state_report_rpc_errors",
     "Number of Executor state report RPC errors",
 )
-
+metric_state_report_rpc_latency: prometheus_client.Histogram = (
     latency_metric_for_fast_operation(
         "state_report_rpc", "Executor state report rpc to Server"
     )
indexify/executor/monitoring/health_checker/generic_health_checker.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional

 from .health_checker import HealthChecker, HealthCheckResult
+from .metrics.health_checker import metric_healthy

 _HEALTH_CHECKER_NAME = "GenericHealthChecker"

@@ -13,13 +14,16 @@ class GenericHealthChecker(HealthChecker):

     def __init__(self):
         self._server_connection_unhealthy_status_message: Optional[str] = None
+        metric_healthy.set(1)

     def server_connection_state_changed(self, is_healthy: bool, status_message: str):
         """Handle changes in server connection state."""
         if is_healthy:
             self._server_connection_unhealthy_status_message = None
+            metric_healthy.set(1)
         else:
             self._server_connection_unhealthy_status_message = status_message
+            metric_healthy.set(0)

     async def check(self) -> HealthCheckResult:
         if self._server_connection_unhealthy_status_message is not None:
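generic_health_checker.py now mirrors the boolean connection state into a Prometheus gauge (metric_healthy, imported from the new metrics module). A small self-contained sketch of the same idea with prometheus_client; the metric name used here is illustrative, not necessarily what the wheel registers.

import prometheus_client

# Illustrative gauge; the wheel defines its own metric in
# indexify/executor/monitoring/health_checker/metrics/health_checker.py.
metric_healthy = prometheus_client.Gauge(
    "executor_healthy",
    "1 when the Executor considers itself healthy, 0 otherwise",
)


def server_connection_state_changed(is_healthy: bool) -> None:
    # Mirror the boolean connection state into the gauge so /metrics exposes it.
    metric_healthy.set(1 if is_healthy else 0)


server_connection_state_changed(True)
print(prometheus_client.generate_latest().decode())  # output includes executor_healthy 1.0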
indexify/executor/state_reconciler.py CHANGED
@@ -1,6 +1,15 @@
 import asyncio
 from pathlib import Path
-from typing import
+from typing import (
+    Any,
+    AsyncIterable,
+    AsyncIterator,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+)

 from tensorlake.function_executor.proto.message_validator import MessageValidator

@@ -33,6 +42,10 @@ from .state_reporter import ExecutorStateReporter

 _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
 _RECONCILIATION_RETRIES = 3
+# If we didn't get a new desired state from the stream within this timeout then the stream might
+# not be healthy due to network disruption. In this case we need to recreate the stream to make
+# sure that Server really doesn't want to send us a new state.
+_DESIRED_EXECUTOR_STATES_TIMEOUT_SEC = 5 * 60  # 5 minutes


 class ExecutorStateReconciler:
@@ -141,16 +154,15 @@ class ExecutorStateReconciler:
         Never raises any exceptions. Get cancelled via aio task cancellation.
         """
         while True:
+            desired_states_stream: Optional[AsyncIterable[DesiredExecutorState]] = None
             try:
-                stub = ExecutorAPIStub(await self._channel_manager.
+                stub = ExecutorAPIStub(await self._channel_manager.get_shared_channel())
                 # Report state once before starting the stream so Server
                 # doesn't use stale state it knew about this Executor in the past.
                 await self._state_reporter.report_state_and_wait_for_completion()

-                desired_states_stream
-
-                        GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
-                    )
+                desired_states_stream = stub.get_desired_executor_states(
+                    GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
                 )
                 self._logger.info("created new desired states stream")
                 await self._process_desired_states_stream(desired_states_stream)
@@ -159,6 +171,11 @@ class ExecutorStateReconciler:
                     f"error while processing desired states stream",
                     exc_info=e,
                 )
+            finally:
+                # Cleanly signal Server that the stream is closed by client.
+                # See https://stackoverflow.com/questions/72207914/how-to-stop-listening-on-a-stream-in-python-grpc-client
+                if desired_states_stream is not None:
+                    desired_states_stream.cancel()

             self._logger.info(
                 f"desired states stream closed, reconnecting in {self._server_backoff_interval_sec} sec"
@@ -166,10 +183,21 @@
             await asyncio.sleep(self._server_backoff_interval_sec)

     async def _process_desired_states_stream(
-        self, desired_states:
+        self, desired_states: AsyncIterable[DesiredExecutorState]
     ):
-
-
+        desired_states_iter: AsyncIterator[DesiredExecutorState] = aiter(desired_states)
+        while True:
+            try:
+                new_state: DesiredExecutorState = await asyncio.wait_for(
+                    anext(desired_states_iter),
+                    timeout=_DESIRED_EXECUTOR_STATES_TIMEOUT_SEC,
+                )
+            except asyncio.TimeoutError:
+                self._logger.info(
+                    f"No desired state received from Server within {_DESIRED_EXECUTOR_STATES_TIMEOUT_SEC} sec, recreating the stream to ensure it is healthy"
+                )
+                break  # Timeout reached, stream might be unhealthy, exit the loop to recreate the stream.
+
             validator: MessageValidator = MessageValidator(new_state)
             try:
                 validator.required_field("clock")
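The reconciler change wraps each read from the desired-states stream in asyncio.wait_for(), so a silent network disruption is detected after _DESIRED_EXECUTOR_STATES_TIMEOUT_SEC and the stream is torn down and recreated. A toy sketch of the same loop, with a local async generator standing in for the gRPC stream (requires Python 3.10+ for the aiter()/anext() builtins; the 2-second timeout is illustrative):

import asyncio
from typing import AsyncIterable, AsyncIterator


async def stalled_stream() -> AsyncIterable[int]:
    # Stand-in for the gRPC desired-states stream: one message, then silence.
    yield 1
    await asyncio.sleep(60)
    yield 2


async def consume(stream: AsyncIterable[int], timeout_sec: float) -> None:
    stream_iter: AsyncIterator[int] = aiter(stream)
    while True:
        try:
            item = await asyncio.wait_for(anext(stream_iter), timeout=timeout_sec)
        except asyncio.TimeoutError:
            # No message within the timeout: treat the stream as possibly unhealthy.
            print("timed out, would recreate the stream here")
            break
        except StopAsyncIteration:
            break
        print("received", item)


asyncio.run(consume(stalled_stream(), timeout_sec=2.0))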
indexify/executor/state_reporter.py CHANGED
@@ -30,10 +30,11 @@ from .function_executor_controller.loggers import task_result_logger
 from .host_resources.host_resources import HostResources, HostResourcesProvider
 from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
 from .metrics.state_reporter import (
-
-
+    metric_state_report_rpc_errors,
+    metric_state_report_rpc_latency,
     metric_state_report_rpcs,
 )
+from .monitoring.health_checker.health_checker import HealthChecker

 _REPORTING_INTERVAL_SEC = 5
 _REPORTING_BACKOFF_SEC = 5
@@ -49,6 +50,7 @@ class ExecutorStateReporter:
         function_allowlist: List[FunctionURI],
         channel_manager: ChannelManager,
         host_resources_provider: HostResourcesProvider,
+        health_checker: HealthChecker,
         logger: Any,
     ):
         self._executor_id: str = executor_id
@@ -57,6 +59,7 @@ class ExecutorStateReporter:
         self._labels.update(_executor_labels())
         self._hostname: str = gethostname()
         self._channel_manager = channel_manager
+        self._health_checker: HealthChecker = health_checker
         self._logger: Any = logger.bind(module=__name__)
         self._allowed_functions: List[AllowedFunction] = _to_allowed_function_protos(
             function_allowlist
@@ -167,10 +170,15 @@ class ExecutorStateReporter:
         # Don't retry state report if it failed during shutdown.
         # We only do best effort last state report and Server might not be available.
         try:
-            async with self._channel_manager.
-
-
-
+            async with self._channel_manager.create_standalone_channel() as channel:
+                await ExecutorAPIStub(channel).report_executor_state(
+                    ReportExecutorStateRequest(
+                        executor_state=self._current_executor_state(),
+                        executor_update=self._remove_pending_update(),
+                    ),
+                    timeout=_REPORT_RPC_TIMEOUT_SEC,
+                )
+        except Exception as e:
             self._logger.error(
                 "failed to report state during shutdown",
                 exc_info=e,
@@ -187,60 +195,48 @@ class ExecutorStateReporter:
         Never raises any exceptions.
         """
         while True:
-            stub = ExecutorAPIStub(await self._channel_manager.
+            stub = ExecutorAPIStub(await self._channel_manager.get_shared_channel())
             while True:
                 await self._state_report_scheduled_event.wait()
                 # Clear the event immidiately to report again asap if needed. This reduces latency in the system.
                 self._state_report_scheduled_event.clear()
                 try:
-
-
-
-
-
+                    state: ExecutorState = self._current_executor_state()
+                    update: ExecutorUpdate = self._remove_pending_update()
+                    _log_reported_executor_update(update, self._logger)
+
+                    with (
+                        metric_state_report_rpc_errors.count_exceptions(),
+                        metric_state_report_rpc_latency.time(),
+                    ):
+                        metric_state_report_rpcs.inc()
+                        await stub.report_executor_state(
+                            ReportExecutorStateRequest(
+                                executor_state=state, executor_update=update
+                            ),
+                            timeout=_REPORT_RPC_TIMEOUT_SEC,
+                        )
                     self._state_reported_event.set()
+                    self._health_checker.server_connection_state_changed(
+                        is_healthy=True, status_message="grpc server channel is healthy"
+                    )
                 except Exception as e:
+                    self._add_to_pending_update(update)
                     self._logger.error(
                         f"failed to report state to the server, backing-off for {_REPORTING_BACKOFF_SEC} sec.",
                         exc_info=e,
                     )
+                    # The periodic state reports serve as channel health monitoring requests
+                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
+                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
+                    # from this channel health monitoring.
+                    self._health_checker.server_connection_state_changed(
+                        is_healthy=False,
+                        status_message="grpc server channel is unhealthy",
+                    )
+                    await self._channel_manager.fail_shared_channel()
                     await asyncio.sleep(_REPORTING_BACKOFF_SEC)
-                    break  # exit the inner loop to
-
-    async def _report_state(self, stub: ExecutorAPIStub):
-        """Reports the current state to the server represented by the supplied stub.
-
-        Raises an exception on failure.
-        """
-        with (
-            metric_state_report_errors.count_exceptions(),
-            metric_state_report_latency.time(),
-        ):
-            metric_state_report_rpcs.inc()
-            state: ExecutorState = self._current_executor_state()
-            update: ExecutorUpdate = self._remove_pending_update()
-
-            for task_result in update.task_results:
-                task_result_logger(task_result, self._logger).info(
-                    "reporting task outcome",
-                    outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
-                    failure_reason=(
-                        TaskFailureReason.Name(task_result.failure_reason)
-                        if task_result.HasField("failure_reason")
-                        else "None"
-                    ),
-                )
-
-            try:
-                await stub.report_executor_state(
-                    ReportExecutorStateRequest(
-                        executor_state=state, executor_update=update
-                    ),
-                    timeout=_REPORT_RPC_TIMEOUT_SEC,
-                )
-            except Exception as e:
-                self._add_to_pending_update(update)
-                raise
+                    break  # exit the inner loop to use the recreated channel

     def _current_executor_state(self) -> ExecutorState:
         """Returns the current executor state."""
@@ -284,6 +280,28 @@ class ExecutorStateReporter:
         self.add_function_executor_update(function_executor_update)


+def _log_reported_executor_update(update: ExecutorUpdate, logger: Any) -> None:
+    """Logs the reported executor update.
+
+    Doesn't raise any exceptions."""
+    try:
+        for task_result in update.task_results:
+            task_result_logger(task_result, logger).info(
+                "reporting task outcome",
+                outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
+                failure_reason=(
+                    TaskFailureReason.Name(task_result.failure_reason)
+                    if task_result.HasField("failure_reason")
+                    else "None"
+                ),
+            )
+    except Exception as e:
+        logger.error(
+            "failed to log reported executor update",
+            exc_info=e,
+        )
+
+
 def _to_allowed_function_protos(
     function_allowlist: List[FunctionURI],
 ) -> List[AllowedFunction]:
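state_reporter.py folds the old _report_state() helper into the reporting loop and wraps the RPC in the renamed metrics, counting exceptions and timing the call in a single with-block, while driving the health checker and fail_shared_channel() on failure. A short sketch of the count_exceptions()/time() combination with prometheus_client; the metric names are illustrative, and the parenthesized with-statement needs Python 3.10 or newer.

import prometheus_client

# Illustrative metrics; the wheel uses the renamed metric_state_report_rpc_* objects.
rpc_attempts = prometheus_client.Counter("demo_rpcs", "RPC attempts")
rpc_errors = prometheus_client.Counter("demo_rpc_errors", "RPC errors")
rpc_latency = prometheus_client.Histogram("demo_rpc_latency_seconds", "RPC latency")


def do_rpc(fail: bool) -> None:
    # count_exceptions() bumps the error counter only if the block raises;
    # time() records the block duration in the histogram either way.
    with (
        rpc_errors.count_exceptions(),
        rpc_latency.time(),
    ):
        rpc_attempts.inc()
        if fail:
            raise RuntimeError("simulated RPC failure")


do_rpc(fail=False)
try:
    do_rpc(fail=True)
except RuntimeError:
    pass

print(prometheus_client.REGISTRY.get_sample_value("demo_rpc_errors_total"))  # 1.0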
indexify/proto/executor_api.proto CHANGED
@@ -288,7 +288,5 @@ service ExecutorAPI {

   // Called by Executor to open a stream of its desired states. When Server wants Executor to change something
   // it puts a message on the stream with the new desired state of the Executor.
-  //
-  // Deprecated HTTP API is used to download the serialized graph and task inputs.
   rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
 }
indexify/proto/executor_api_pb2_grpc.py CHANGED
@@ -79,8 +79,6 @@ class ExecutorAPIServicer(object):
     def get_desired_executor_states(self, request, context):
         """Called by Executor to open a stream of its desired states. When Server wants Executor to change something
         it puts a message on the stream with the new desired state of the Executor.
-
-        Deprecated HTTP API is used to download the serialized graph and task inputs.
         """
         context.set_code(grpc.StatusCode.UNIMPLEMENTED)
         context.set_details("Method not implemented!")
{indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: indexify
-Version: 0.4.16
+Version: 0.4.18
 Summary: Open Source Indexify components and helper tools
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -14,10 +14,10 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: aiohttp (>=3.12.14,<4.0.0)
-Requires-Dist: boto3 (>=1.39.
+Requires-Dist: boto3 (>=1.39.8,<2.0.0)
 Requires-Dist: prometheus-client (>=0.22.1,<0.23.0)
 Requires-Dist: psutil (>=7.0.0,<8.0.0)
-Requires-Dist: tensorlake (==0.2.
+Requires-Dist: tensorlake (==0.2.27)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown

{indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/RECORD CHANGED
@@ -7,8 +7,8 @@ indexify/executor/blob_store/blob_store.py,sha256=XViw_KRfFSNqwcFYwMZixZF-EYCjXK
 indexify/executor/blob_store/local_fs_blob_store.py,sha256=6LexqMBGXp8f6Ka95R6xMIUyDutrZJABOMNcp-ssa98,1809
 indexify/executor/blob_store/metrics/blob_store.py,sha256=5_xiPREeHWFtxFh1NupDsF8zP4pmUPgLNNn-UE9Uzvc,1008
 indexify/executor/blob_store/s3_blob_store.py,sha256=G3B_V3gUE7XbUY42lDtBczUKuA7q8S7MD43tx1aHrJo,3445
-indexify/executor/channel_manager.py,sha256=
-indexify/executor/executor.py,sha256=
+indexify/executor/channel_manager.py,sha256=ihKfWJmUqQvh4UKXewZLzyJWW_f50P4fnwPqPonrozw,6651
+indexify/executor/executor.py,sha256=rM7BmJDqC_YwdwPfDGFGiFO2WxOW3Nj8Z7rwRw8UcFk,6353
 indexify/executor/function_allowlist.py,sha256=PCelCW6qIe_2sH11BCKr7LDqarRV5kwNsrfB2EV7Zwo,1772
 indexify/executor/function_executor/function_executor.py,sha256=Hz_dT_2i1m9akUGfULWQpDlMsn0CI1AX4Mdt7-oOknI,13598
 indexify/executor/function_executor/health_checker.py,sha256=IxE0jnC99K_lvnizFLjXqS1942H8-FNAN4AlhLIjg2Y,6373
@@ -48,22 +48,23 @@ indexify/executor/host_resources/nvidia_gpu_allocator.py,sha256=AOcXKglLyRD-GrZz
 indexify/executor/metrics/channel_manager.py,sha256=1dU9bzF3xqBy1nY9Sc66GfQQWnWZSNip4lEH1vjoWdI,648
 indexify/executor/metrics/executor.py,sha256=8dJXmyGqKlBSrPuyWXW7O2I21uxQ687l-2dYTvz4fmk,398
 indexify/executor/metrics/state_reconciler.py,sha256=BSlRgvgtwih6QcYrsFU5P2ylaXAsC_X70DbzDuv9NsU,584
-indexify/executor/metrics/state_reporter.py,sha256=
+indexify/executor/metrics/state_reporter.py,sha256=JvyP_IUfJQetEjzmoWe9q6HCA4Ao1GLocaa7Od_jl2g,550
 indexify/executor/monitoring/handler.py,sha256=Cj1cu_LcsAP0tdviqNhoEtGm4h0OJAxxzW9C2YdNXYU,240
 indexify/executor/monitoring/health_check_handler.py,sha256=e1pEtWFKaVs6H57Z4YLejNECrJtC38PweZc7xTJeqVw,695
-indexify/executor/monitoring/health_checker/generic_health_checker.py,sha256=
+indexify/executor/monitoring/health_checker/generic_health_checker.py,sha256=vJRV879GrdZFqwTnM9pRLA97LRMutGz2sWRy-KS-tfg,1493
 indexify/executor/monitoring/health_checker/health_checker.py,sha256=B-Q4KM1iEUSMA2fr9PBhBLdA7sYII_NuTRmPuRILGSo,665
+indexify/executor/monitoring/health_checker/metrics/health_checker.py,sha256=50JS4JaOdAgSk7iYaBV4J3tGXkRTzmIVR_jVOV66YOc,129
 indexify/executor/monitoring/metrics.py,sha256=Dx2wPcTKvbd5Y5rGOfeyscFtAQ2DZ16_s5BX6d4nhI8,6660
 indexify/executor/monitoring/prometheus_metrics_handler.py,sha256=KiGqSf7rkXTfbDwThyXFpFe2jnuZD5q-5SBP_0GDo8Y,591
 indexify/executor/monitoring/server.py,sha256=yzdYhcxnmY6uTQUMt3vatF5jilN52ZtfFseOmHyQpTo,1254
 indexify/executor/monitoring/startup_probe_handler.py,sha256=zXXsBU15SMlBx1bSFpxWDfed1VHtKKnwvLQ8-frpG98,425
-indexify/executor/state_reconciler.py,sha256=
-indexify/executor/state_reporter.py,sha256=
-indexify/proto/executor_api.proto,sha256=
+indexify/executor/state_reconciler.py,sha256=hPTjCUkXQV0HIwa5JczYpb5gvTGonQpkxqOvQXf-QU4,20057
+indexify/executor/state_reporter.py,sha256=zf5UBhBZVv9SQ1Ju_bY8w6D_t1hBZ5YVXhjeFMEgRms,15208
+indexify/proto/executor_api.proto,sha256=nIgc1vDbKVpmqCrWOQzF3NFoHxnMcYiaFC3OyA2CsnE,12117
 indexify/proto/executor_api_pb2.py,sha256=zdl00UOqgOB1KeRIAceh_43RpAOVLEs9RSbzxQ0hmKY,16163
 indexify/proto/executor_api_pb2.pyi,sha256=adD5mqqJhmTgRCa_4v1cR6GcOY-VOLOBV9k8T5iaqPc,22647
-indexify/proto/executor_api_pb2_grpc.py,sha256=
-indexify-0.4.
-indexify-0.4.
-indexify-0.4.
-indexify-0.4.
+indexify/proto/executor_api_pb2_grpc.py,sha256=cd-oqWdNGwQwIBA0qdPzd4gvVeLGZXoFis0nlqOlxiU,7520
+indexify-0.4.18.dist-info/METADATA,sha256=CDj1zRZp5NarWJm0mgajWtPozeQUN8jhqXAFro2aS_g,1116
+indexify-0.4.18.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+indexify-0.4.18.dist-info/entry_points.txt,sha256=rMJqbE5KPZIXTPIfAtVIM4zpUElqYVgEYd6i7N23zzg,49
+indexify-0.4.18.dist-info/RECORD,,
{indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/WHEEL: File without changes
{indexify-0.4.16.dist-info → indexify-0.4.18.dist-info}/entry_points.txt: File without changes