indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +12 -0
- indexify/executor/api_objects.py +11 -6
- indexify/executor/blob_store/blob_store.py +69 -0
- indexify/executor/blob_store/local_fs_blob_store.py +48 -0
- indexify/executor/blob_store/metrics/blob_store.py +33 -0
- indexify/executor/blob_store/s3_blob_store.py +88 -0
- indexify/executor/downloader.py +192 -27
- indexify/executor/executor.py +29 -13
- indexify/executor/function_executor/function_executor.py +1 -1
- indexify/executor/function_executor/function_executor_states_container.py +5 -0
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +7 -2
- indexify/executor/function_executor/invocation_state_client.py +4 -2
- indexify/executor/function_executor/single_task_runner.py +2 -0
- indexify/executor/function_executor/task_output.py +8 -1
- indexify/executor/grpc/channel_manager.py +4 -3
- indexify/executor/grpc/function_executor_controller.py +163 -193
- indexify/executor/grpc/metrics/state_reconciler.py +17 -0
- indexify/executor/grpc/metrics/task_controller.py +8 -0
- indexify/executor/grpc/state_reconciler.py +305 -188
- indexify/executor/grpc/state_reporter.py +18 -10
- indexify/executor/grpc/task_controller.py +247 -189
- indexify/executor/metrics/task_reporter.py +17 -0
- indexify/executor/task_reporter.py +217 -94
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +37 -11
- indexify/proto/executor_api_pb2.py +49 -47
- indexify/proto/executor_api_pb2.pyi +55 -15
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
- indexify/executor/grpc/completed_tasks_container.py +0 -26
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0
indexify/executor/executor.py
CHANGED
@@ -12,6 +12,7 @@ from tensorlake.utils.logging import suppress as suppress_logging
|
|
12
12
|
from indexify.proto.executor_api_pb2 import ExecutorStatus
|
13
13
|
|
14
14
|
from .api_objects import FunctionURI, Task
|
15
|
+
from .blob_store.blob_store import BLOBStore
|
15
16
|
from .downloader import Downloader
|
16
17
|
from .executor_flavor import ExecutorFlavor
|
17
18
|
from .function_executor.function_executor_states_container import (
|
@@ -69,6 +70,7 @@ class Executor:
|
|
69
70
|
monitoring_server_host: str,
|
70
71
|
monitoring_server_port: int,
|
71
72
|
enable_grpc_state_reconciler: bool,
|
73
|
+
blob_store: BLOBStore,
|
72
74
|
):
|
73
75
|
self._logger = structlog.get_logger(module=__name__)
|
74
76
|
self._is_shutdown: bool = False
|
@@ -95,7 +97,10 @@ class Executor:
|
|
95
97
|
self._function_executor_states
|
96
98
|
)
|
97
99
|
self._downloader = Downloader(
|
98
|
-
code_path=code_path,
|
100
|
+
code_path=code_path,
|
101
|
+
base_url=self._base_url,
|
102
|
+
blob_store=blob_store,
|
103
|
+
config_path=config_path,
|
99
104
|
)
|
100
105
|
self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
|
101
106
|
self._function_executor_server_factory = function_executor_server_factory
|
@@ -123,6 +128,7 @@ class Executor:
|
|
123
128
|
executor_id=id,
|
124
129
|
config_path=config_path,
|
125
130
|
channel_manager=self._channel_manager,
|
131
|
+
blob_store=blob_store,
|
126
132
|
)
|
127
133
|
|
128
134
|
# HTTP mode task runner
|
@@ -189,12 +195,15 @@ class Executor:
|
|
189
195
|
signum, self.shutdown, asyncio.get_event_loop()
|
190
196
|
)
|
191
197
|
|
192
|
-
asyncio.get_event_loop().create_task(
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
+
asyncio.get_event_loop().create_task(
|
199
|
+
self._monitoring_server.run(), name="monitoring server runner"
|
200
|
+
)
|
201
|
+
self._state_reporter.update_executor_status(
|
202
|
+
ExecutorStatus.EXECUTOR_STATUS_RUNNING
|
203
|
+
)
|
204
|
+
asyncio.get_event_loop().create_task(
|
205
|
+
self._state_reporter.run(), name="state reporter runner"
|
206
|
+
)
|
198
207
|
|
199
208
|
metric_executor_state.state("running")
|
200
209
|
self._startup_probe_handler.set_ready()
|
@@ -215,7 +224,6 @@ class Executor:
|
|
215
224
|
"""Runs the gRPC state reconciler and state reporter.
|
216
225
|
|
217
226
|
Never raises any exceptions."""
|
218
|
-
asyncio.create_task(self._state_reporter.run())
|
219
227
|
await self._state_reconciler.run()
|
220
228
|
|
221
229
|
async def _http_task_runner_loop(self):
|
@@ -224,11 +232,15 @@ class Executor:
|
|
224
232
|
async for task in self._task_fetcher.run():
|
225
233
|
metric_tasks_fetched.inc()
|
226
234
|
if not self._is_shutdown:
|
227
|
-
asyncio.create_task(
|
235
|
+
asyncio.create_task(
|
236
|
+
self._run_task(task), name="task runner (http mode)"
|
237
|
+
)
|
238
|
+
self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
|
228
239
|
except Exception as e:
|
229
240
|
self._logger.error(
|
230
241
|
"failed fetching tasks, retrying in 5 seconds", exc_info=e
|
231
242
|
)
|
243
|
+
if not self._is_shutdown:
|
232
244
|
await asyncio.sleep(5)
|
233
245
|
|
234
246
|
async def _run_task(self, task: Task) -> None:
|
@@ -250,6 +262,7 @@ class Executor:
|
|
250
262
|
function_name=task.compute_fn,
|
251
263
|
graph_version=task.graph_version,
|
252
264
|
graph_invocation_id=task.invocation_id,
|
265
|
+
output_payload_uri_prefix=task.output_payload_uri_prefix,
|
253
266
|
)
|
254
267
|
logger.error("task execution failed", exc_info=e)
|
255
268
|
|
@@ -293,17 +306,19 @@ class Executor:
|
|
293
306
|
graph_name=task.compute_graph,
|
294
307
|
graph_version=task.graph_version,
|
295
308
|
logger=logger,
|
309
|
+
data_payload=task.graph_payload,
|
296
310
|
)
|
297
311
|
input: SerializedObject = await self._downloader.download_input(
|
298
312
|
namespace=task.namespace,
|
299
313
|
graph_name=task.compute_graph,
|
300
314
|
graph_invocation_id=task.invocation_id,
|
301
315
|
input_key=task.input_key,
|
316
|
+
data_payload=task.input_payload,
|
302
317
|
logger=logger,
|
303
318
|
)
|
304
319
|
init_value: Optional[SerializedObject] = (
|
305
320
|
None
|
306
|
-
if task.reducer_output_id is None
|
321
|
+
if task.reducer_output_id is None and task.reducer_input_payload is None
|
307
322
|
else (
|
308
323
|
await self._downloader.download_init_value(
|
309
324
|
namespace=task.namespace,
|
@@ -311,6 +326,7 @@ class Executor:
|
|
311
326
|
function_name=task.compute_fn,
|
312
327
|
graph_invocation_id=task.invocation_id,
|
313
328
|
reducer_output_key=task.reducer_output_id,
|
329
|
+
data_payload=task.reducer_input_payload,
|
314
330
|
logger=logger,
|
315
331
|
)
|
316
332
|
)
|
@@ -380,12 +396,12 @@ class Executor:
|
|
380
396
|
if self._task_runner is not None:
|
381
397
|
await self._task_runner.shutdown()
|
382
398
|
|
383
|
-
if self._channel_manager is not None:
|
384
|
-
await self._channel_manager.shutdown()
|
385
399
|
if self._state_reporter is not None:
|
386
400
|
await self._state_reporter.shutdown()
|
387
401
|
if self._state_reconciler is not None:
|
388
402
|
await self._state_reconciler.shutdown()
|
403
|
+
if self._channel_manager is not None:
|
404
|
+
await self._channel_manager.destroy()
|
389
405
|
|
390
406
|
# We need to shutdown all users of FE states first,
|
391
407
|
# otherwise states might disappear unexpectedly and we might
|
@@ -397,7 +413,7 @@ class Executor:
|
|
397
413
|
# The current task is cancelled, the code after this line will not run.
|
398
414
|
|
399
415
|
def shutdown(self, loop):
|
400
|
-
loop.create_task(self._shutdown(loop))
|
416
|
+
loop.create_task(self._shutdown(loop), name="executor shutdown")
|
401
417
|
|
402
418
|
def _task_logger(self, task: Task) -> Any:
|
403
419
|
return self._logger.bind(
|
@@ -318,6 +318,6 @@ async def _initialize_server(
|
|
318
318
|
except grpc.aio.AioRpcError as e:
|
319
319
|
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
320
320
|
raise CustomerError(
|
321
|
-
f"Customer code timeout {customer_code_timeout_sec} sec expired"
|
321
|
+
f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
|
322
322
|
) from e
|
323
323
|
raise
|
@@ -54,6 +54,11 @@ class FunctionExecutorStatesContainer:
|
|
54
54
|
|
55
55
|
return self._states[id]
|
56
56
|
|
57
|
+
async def get(self, id: str) -> FunctionExecutorState:
|
58
|
+
"""Get the state with the given ID. Raises Exception if the state does not exist."""
|
59
|
+
async with self._lock:
|
60
|
+
return self._states[id]
|
61
|
+
|
57
62
|
async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
|
58
63
|
async with self._lock:
|
59
64
|
for state in self._states.values():
|
@@ -34,6 +34,8 @@ class FunctionExecutorStatus(Enum):
|
|
34
34
|
SHUTDOWN = "Shutdown" # Permanent stop state
|
35
35
|
|
36
36
|
|
37
|
+
# TODO: After removing HTTP code simplify state transitions by not allowing to
|
38
|
+
# startup an FE after it was destroyed. grpc protocol treats FEs as ephemeral and never revives them.
|
37
39
|
def is_status_change_allowed(
|
38
40
|
current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
|
39
41
|
) -> bool:
|
@@ -107,7 +107,9 @@ class HealthChecker:
|
|
107
107
|
return
|
108
108
|
|
109
109
|
self._health_check_failed_callback = callback
|
110
|
-
self._health_check_loop_task = asyncio.create_task(
|
110
|
+
self._health_check_loop_task = asyncio.create_task(
|
111
|
+
self._health_check_loop(), name="function executor health checker loop"
|
112
|
+
)
|
111
113
|
|
112
114
|
def stop(self) -> None:
|
113
115
|
"""Stops the periodic health checks.
|
@@ -126,7 +128,10 @@ class HealthChecker:
|
|
126
128
|
break
|
127
129
|
await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
|
128
130
|
|
129
|
-
asyncio.create_task(
|
131
|
+
asyncio.create_task(
|
132
|
+
self._health_check_failed_callback(result),
|
133
|
+
name="function executor health check failure callback",
|
134
|
+
)
|
130
135
|
self._health_check_loop_task = None
|
131
136
|
|
132
137
|
|
@@ -67,7 +67,8 @@ class InvocationStateClient:
|
|
67
67
|
self._response_generator()
|
68
68
|
)
|
69
69
|
self._request_loop_task = asyncio.create_task(
|
70
|
-
self._request_loop(server_requests)
|
70
|
+
self._request_loop(server_requests),
|
71
|
+
name="graph invocation state client request processing loop",
|
71
72
|
)
|
72
73
|
|
73
74
|
def add_task_to_invocation_id_entry(self, task_id: str, invocation_id: str) -> None:
|
@@ -100,7 +101,8 @@ class InvocationStateClient:
|
|
100
101
|
pass
|
101
102
|
except asyncio.CancelledError:
|
102
103
|
# This async task was cancelled by destroy(). Normal situation too.
|
103
|
-
|
104
|
+
# This exception should not be suppressed, see Python asyncio docs.
|
105
|
+
raise
|
104
106
|
except Exception as e:
|
105
107
|
metric_request_read_errors.inc()
|
106
108
|
self._logger.error(
|
@@ -96,6 +96,7 @@ class SingleTaskRunner:
|
|
96
96
|
graph_invocation_id=self._task_input.task.invocation_id,
|
97
97
|
stderr=str(e),
|
98
98
|
success=False,
|
99
|
+
output_payload_uri_prefix=self._task_input.task.output_payload_uri_prefix,
|
99
100
|
)
|
100
101
|
|
101
102
|
try:
|
@@ -311,6 +312,7 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
|
|
311
312
|
reducer=response.is_reducer,
|
312
313
|
success=response.success,
|
313
314
|
metrics=metrics,
|
315
|
+
output_payload_uri_prefix=task.output_payload_uri_prefix,
|
314
316
|
)
|
315
317
|
|
316
318
|
if response.HasField("function_output"):
|
@@ -25,6 +25,7 @@ class TaskOutput:
|
|
25
25
|
function_name: str,
|
26
26
|
graph_version: str,
|
27
27
|
graph_invocation_id: str,
|
28
|
+
output_payload_uri_prefix: Optional[str],
|
28
29
|
output_encoding: Optional[str] = None,
|
29
30
|
function_output: Optional[FunctionOutput] = None,
|
30
31
|
router_output: Optional[RouterOutput] = None,
|
@@ -50,6 +51,7 @@ class TaskOutput:
|
|
50
51
|
self.is_internal_error = is_internal_error
|
51
52
|
self.metrics = metrics
|
52
53
|
self.output_encoding = output_encoding
|
54
|
+
self.output_payload_uri_prefix = output_payload_uri_prefix
|
53
55
|
|
54
56
|
@classmethod
|
55
57
|
def internal_error(
|
@@ -60,6 +62,7 @@ class TaskOutput:
|
|
60
62
|
function_name: str,
|
61
63
|
graph_version: str,
|
62
64
|
graph_invocation_id: str,
|
65
|
+
output_payload_uri_prefix: Optional[str],
|
63
66
|
) -> "TaskOutput":
|
64
67
|
"""Creates a TaskOutput for an internal error."""
|
65
68
|
# We are not sharing internal error messages with the customer.
|
@@ -72,6 +75,7 @@ class TaskOutput:
|
|
72
75
|
graph_invocation_id=graph_invocation_id,
|
73
76
|
stderr="Platform failed to execute the function.",
|
74
77
|
is_internal_error=True,
|
78
|
+
output_payload_uri_prefix=output_payload_uri_prefix,
|
75
79
|
)
|
76
80
|
|
77
81
|
@classmethod
|
@@ -83,6 +87,8 @@ class TaskOutput:
|
|
83
87
|
function_name: str,
|
84
88
|
graph_version: str,
|
85
89
|
graph_invocation_id: str,
|
90
|
+
timeout_sec: float,
|
91
|
+
output_payload_uri_prefix: Optional[str],
|
86
92
|
) -> "TaskOutput":
|
87
93
|
"""Creates a TaskOutput for an function timeout error."""
|
88
94
|
# Task stdout, stderr is not available.
|
@@ -93,6 +99,7 @@ class TaskOutput:
|
|
93
99
|
function_name=function_name,
|
94
100
|
graph_version=graph_version,
|
95
101
|
graph_invocation_id=graph_invocation_id,
|
96
|
-
stderr="Function
|
102
|
+
stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
|
97
103
|
is_internal_error=False,
|
104
|
+
output_payload_uri_prefix=output_payload_uri_prefix,
|
98
105
|
)
|
@@ -69,6 +69,10 @@ class ChannelManager:
|
|
69
69
|
certificate_chain=certificate_chain,
|
70
70
|
)
|
71
71
|
|
72
|
+
async def destroy(self):
|
73
|
+
if self._channel is not None:
|
74
|
+
await self._destroy_locked_channel()
|
75
|
+
|
72
76
|
async def get_channel(self) -> grpc.aio.Channel:
|
73
77
|
"""Returns a channel to the gRPC server.
|
74
78
|
|
@@ -155,6 +159,3 @@ class ChannelManager:
|
|
155
159
|
except Exception as e:
|
156
160
|
self._logger.error("failed closing channel", exc_info=e)
|
157
161
|
self._channel = None
|
158
|
-
|
159
|
-
async def shutdown(self):
|
160
|
-
pass
|