indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,364 @@
|
|
1
|
+
import asyncio
|
2
|
+
import hashlib
|
3
|
+
import platform
|
4
|
+
import sys
|
5
|
+
from socket import gethostname
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
|
8
|
+
from indexify.proto.executor_api_pb2 import (
|
9
|
+
AllowedFunction,
|
10
|
+
ExecutorState,
|
11
|
+
ExecutorStatus,
|
12
|
+
FunctionExecutorState,
|
13
|
+
GPUModel,
|
14
|
+
GPUResources,
|
15
|
+
)
|
16
|
+
from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
|
17
|
+
from indexify.proto.executor_api_pb2 import (
|
18
|
+
ReportExecutorStateRequest,
|
19
|
+
TaskFailureReason,
|
20
|
+
TaskOutcomeCode,
|
21
|
+
TaskResult,
|
22
|
+
)
|
23
|
+
from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
|
24
|
+
|
25
|
+
from .channel_manager import ChannelManager
|
26
|
+
from .function_allowlist import FunctionURI
|
27
|
+
from .function_executor_controller.loggers import task_logger
|
28
|
+
from .function_executor_controller.task_output import TaskOutput
|
29
|
+
from .host_resources.host_resources import HostResources, HostResourcesProvider
|
30
|
+
from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
|
31
|
+
from .metrics.state_reporter import (
|
32
|
+
metric_state_report_errors,
|
33
|
+
metric_state_report_latency,
|
34
|
+
metric_state_report_rpcs,
|
35
|
+
)
|
36
|
+
|
37
|
+
_REPORTING_INTERVAL_SEC = 5
|
38
|
+
_REPORT_RPC_TIMEOUT_SEC = 5
|
39
|
+
|
40
|
+
|
41
|
+
class ExecutorStateReporter:
    """Periodically reports Executor state and completed task results to the Server.

    Two background asyncio tasks cooperate: a scheduler that marks a report as
    due every ``reporting_interval_sec`` seconds, and a worker that sends a
    report whenever one is scheduled. Callers can also force an immediate
    report via ``schedule_state_report()`` or wait for one with
    ``report_state_and_wait_for_completion()``.
    """

    def __init__(
        self,
        executor_id: str,
        version: str,
        labels: Dict[str, str],
        function_allowlist: List[FunctionURI],
        channel_manager: ChannelManager,
        host_resources_provider: HostResourcesProvider,
        logger: Any,
        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
    ):
        self._executor_id: str = executor_id
        self._version: str = version
        self._labels: Dict[str, str] = labels.copy()
        self._labels.update(_executor_labels())
        self._hostname: str = gethostname()
        self._channel_manager = channel_manager
        self._logger: Any = logger.bind(module=__name__)
        self._reporting_interval_sec: int = reporting_interval_sec
        self._allowed_functions: List[AllowedFunction] = _to_allowed_function_protos(
            function_allowlist
        )
        # We need to fetch total resources only once, because they are not changing.
        self._total_host_resources: HostResources = (
            host_resources_provider.total_host_resources(self._logger)
        )
        self._total_function_executor_resources: HostResources = (
            host_resources_provider.total_function_executor_resources(self._logger)
        )
        self._logger.info(
            "detected host resources",
            total_host_resources=self._total_host_resources,
            total_function_executor_resources=self._total_function_executor_resources,
        )
        self._state_report_worker: Optional[asyncio.Task] = None
        self._periodic_state_report_scheduler: Optional[asyncio.Task] = None

        # Mutable fields
        self._state_report_scheduled_event: asyncio.Event = asyncio.Event()
        self._state_reported_event: asyncio.Event = asyncio.Event()
        self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
        self._last_server_clock: int = (
            0  # Server expects initial value to be 0 until it is set by Server.
        )
        self._completed_task_outputs: List[TaskOutput] = []
        self._function_executor_states: Dict[str, FunctionExecutorState] = {}

    def update_executor_status(self, value: ExecutorStatus) -> None:
        self._executor_status = value

    def update_last_server_clock(self, value: int) -> None:
        self._last_server_clock = value

    def update_function_executor_state(
        self,
        state: FunctionExecutorState,
    ) -> None:
        self._function_executor_states[state.description.id] = state

    def remove_function_executor_info(self, function_executor_id: str) -> None:
        if function_executor_id not in self._function_executor_states:
            self._logger.warning(
                "attempted to remove non-existing function executor state",
                function_executor_id=function_executor_id,
            )
            return

        self._function_executor_states.pop(function_executor_id)

    def add_completed_task_output(self, task_output: TaskOutput) -> None:
        self._completed_task_outputs.append(task_output)

    def schedule_state_report(self) -> None:
        """Schedules a state report to be sent to the server asap.

        This method is called when the executor state changes and it needs to get reported.
        The call doesn't block and returns immediately.
        """
        self._state_report_scheduled_event.set()

    async def report_state_and_wait_for_completion(self) -> None:
        """Schedules a state report to be sent to the server asap and waits for the completion of the reporting."""
        self._state_reported_event.clear()
        self.schedule_state_report()
        await self._state_reported_event.wait()

    def run(self) -> None:
        """Runs the state reporter.

        This method is called when the executor starts and it needs to start reporting its state
        periodically.
        """
        self._state_report_worker = asyncio.create_task(
            self._state_report_worker_loop(), name="state_reporter_worker"
        )
        self._periodic_state_report_scheduler = asyncio.create_task(
            self._periodic_state_report_scheduler_loop(),
            name="state_reporter_periodic_scheduler",
        )

    async def shutdown(self) -> None:
        """Tries to do one last state report and shuts down the state reporter.

        Doesn't raise any exceptions."""
        if self._state_report_worker is not None:
            self._state_report_worker.cancel()
            try:
                await self._state_report_worker
            except asyncio.CancelledError:
                pass  # Expected exception
            self._state_report_worker = None

        if self._periodic_state_report_scheduler is not None:
            self._periodic_state_report_scheduler.cancel()
            try:
                await self._periodic_state_report_scheduler
            except asyncio.CancelledError:
                pass
            self._periodic_state_report_scheduler = None

        # Don't retry state report if it failed during shutdown.
        # We only do best effort last state report and Server might not be available.
        try:
            async with self._channel_manager.create_channel() as channel:
                stub = ExecutorAPIStub(channel)
                await self._report_state(stub)
        except BaseException as e:
            self._logger.error(
                "failed to report state during shutdown",
                exc_info=e,
            )

    async def _periodic_state_report_scheduler_loop(self) -> None:
        while True:
            self._state_report_scheduled_event.set()
            await asyncio.sleep(self._reporting_interval_sec)

    async def _state_report_worker_loop(self) -> None:
        """Runs the state reporter.

        Never raises any exceptions.
        """
        while True:
            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
            while True:
                await self._state_report_scheduled_event.wait()
                # Clear the event immediately to report again asap if needed. This reduces latency in the system.
                self._state_report_scheduled_event.clear()
                try:
                    # The periodic state reports serve as channel health monitoring requests
                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
                    # from this channel health monitoring.
                    await self._report_state(stub)
                    self._state_reported_event.set()
                except Exception as e:
                    self._logger.error(
                        f"failed to report state to the server, retrying in {self._reporting_interval_sec} sec.",
                        exc_info=e,
                    )
                    break  # exit the inner loop to recreate the channel if needed

    async def _report_state(self, stub: ExecutorAPIStub):
        """Reports the current state to the server represented by the supplied stub.

        Raises an exception on failure.
        """
        with (
            metric_state_report_errors.count_exceptions(),
            metric_state_report_latency.time(),
        ):
            metric_state_report_rpcs.inc()
            state: ExecutorState = self._current_executor_state()
            task_outputs: List[TaskOutput] = self._remove_completed_tasks()
            task_results: List[TaskResult] = _to_task_result_protos(task_outputs)

            try:
                await stub.report_executor_state(
                    ReportExecutorStateRequest(
                        executor_state=state, task_results=task_results
                    ),
                    timeout=_REPORT_RPC_TIMEOUT_SEC,
                )
            except Exception:
                # Put the outputs back at the front of the pending list so the next
                # report retries them and completion order is preserved relative to
                # tasks that completed while this RPC was in flight.
                self._completed_task_outputs[:0] = task_outputs
                raise

    def _current_executor_state(self) -> ExecutorState:
        """Returns the current executor state."""
        state = ExecutorState(
            executor_id=self._executor_id,
            hostname=self._hostname,
            version=self._version,
            status=self._executor_status,
            total_function_executor_resources=_to_host_resources_proto(
                self._total_function_executor_resources
            ),
            total_resources=_to_host_resources_proto(self._total_host_resources),
            allowed_functions=self._allowed_functions,
            function_executor_states=list(self._function_executor_states.values()),
            labels=self._labels,
        )
        state.state_hash = _state_hash(state)
        # Set fields not included in the state hash.
        state.server_clock = self._last_server_clock
        return state

    def _remove_completed_tasks(self) -> List[TaskOutput]:
        """Drains the pending completed task outputs, preserving completion order.

        NB: the previous implementation popped from the end of the list, which
        reversed the order in which task outcomes were reported to the Server.
        """
        task_outputs: List[TaskOutput] = self._completed_task_outputs
        self._completed_task_outputs = []
        for task_output in task_outputs:
            task_logger(task_output.task, self._logger).info(
                "reporting task outcome",
                outcome_code=TaskOutcomeCode.Name(task_output.outcome_code),
                failure_reason=(
                    "None"
                    if task_output.failure_reason is None
                    else TaskFailureReason.Name(task_output.failure_reason)
                ),
            )
        return task_outputs
|
265
|
+
|
266
|
+
|
267
|
+
def _to_allowed_function_protos(
    function_allowlist: List[FunctionURI],
) -> List[AllowedFunction]:
    """Converts the function allowlist URIs into AllowedFunction protos.

    ``graph_version`` is only set on a proto when the URI pins a version.
    """
    protos: List[AllowedFunction] = []
    for uri in function_allowlist:
        proto = AllowedFunction(
            namespace=uri.namespace,
            graph_name=uri.compute_graph,
            function_name=uri.compute_fn,
        )
        # An unset version means "any graph version is allowed".
        if uri.version is not None:
            proto.graph_version = uri.version
        protos.append(proto)

    return protos
|
283
|
+
|
284
|
+
|
285
|
+
def _state_hash(state: ExecutorState) -> str:
    """Returns a hex SHA-256 digest of the deterministically serialized state.

    Deterministic serialization keeps the hash stable for equal states so the
    Server can cheaply detect whether the reported state changed.
    """
    serialized: bytes = state.SerializeToString(deterministic=True)
    # Not used for security purposes, only for change detection.
    return hashlib.sha256(serialized, usedforsecurity=False).hexdigest()
|
290
|
+
|
291
|
+
|
292
|
+
def _to_host_resources_proto(host_resources: HostResources) -> HostResourcesProto:
    """Converts a HostResources object into its HostResourcesProto message.

    Memory and disk are converted from MB to bytes; the GPU sub-message is only
    populated when at least one GPU is present.
    """
    mb = 1024 * 1024
    proto = HostResourcesProto(
        cpu_count=host_resources.cpu_count,
        memory_bytes=host_resources.memory_mb * mb,
        disk_bytes=host_resources.disk_mb * mb,
    )
    gpus = host_resources.gpus
    if len(gpus) > 0:
        # All GPUs have the same model, so the first one is representative.
        gpu_resources = GPUResources(
            count=len(gpus),
            model=_to_gpu_model_proto(gpus[0].model),
        )
        proto.gpu.CopyFrom(gpu_resources)
    return proto
|
308
|
+
|
309
|
+
|
310
|
+
def _to_gpu_model_proto(nvidia_gpu_model: NVIDIA_GPU_MODEL) -> GPUModel:
    """Maps an NVIDIA_GPU_MODEL value to the corresponding GPUModel proto enum.

    Unrecognized models map to GPU_MODEL_UNKNOWN.
    """
    model_map = {
        NVIDIA_GPU_MODEL.A100_40GB: GPUModel.GPU_MODEL_NVIDIA_A100_40GB,
        NVIDIA_GPU_MODEL.A100_80GB: GPUModel.GPU_MODEL_NVIDIA_A100_80GB,
        NVIDIA_GPU_MODEL.H100_80GB: GPUModel.GPU_MODEL_NVIDIA_H100_80GB,
        NVIDIA_GPU_MODEL.TESLA_T4: GPUModel.GPU_MODEL_NVIDIA_TESLA_T4,
        NVIDIA_GPU_MODEL.A6000: GPUModel.GPU_MODEL_NVIDIA_A6000,
        NVIDIA_GPU_MODEL.A10: GPUModel.GPU_MODEL_NVIDIA_A10,
    }
    return model_map.get(nvidia_gpu_model, GPUModel.GPU_MODEL_UNKNOWN)
|
325
|
+
|
326
|
+
|
327
|
+
def _to_task_result_protos(task_outputs: List[TaskOutput]) -> List[TaskResult]:
    """Converts completed task outputs into TaskResult protos for reporting.

    Optional fields (failure_reason, stdout, stderr, routing) are only set
    when the corresponding output attribute is present.
    """
    results: List[TaskResult] = []

    for task_output in task_outputs:
        task = task_output.task
        router_output = task_output.router_output
        result = TaskResult(
            task_id=task.id,
            allocation_id=task_output.allocation_id,
            namespace=task.namespace,
            graph_name=task.graph_name,
            function_name=task.function_name,
            graph_invocation_id=task.graph_invocation_id,
            reducer=task_output.reducer,
            outcome_code=task_output.outcome_code,
            next_functions=(router_output.edges if router_output else []),
            function_outputs=task_output.uploaded_data_payloads,
        )
        if task_output.failure_reason is not None:
            result.failure_reason = task_output.failure_reason
        if task_output.uploaded_stdout is not None:
            result.stdout.CopyFrom(task_output.uploaded_stdout)
        if task_output.uploaded_stderr is not None:
            result.stderr.CopyFrom(task_output.uploaded_stderr)
        if router_output is not None:
            # Setting the repeated field also marks the `routing` sub-message present.
            result.routing.next_functions[:] = router_output.edges

        results.append(result)

    return results
|
355
|
+
|
356
|
+
|
357
|
+
def _executor_labels() -> Dict[str, str]:
    """Returns standard executor labels always added to user supplied labels."""
    version_info = sys.version_info
    labels: Dict[str, str] = {
        "os": platform.system(),
        "architecture": platform.machine(),
        "python_major_version": str(version_info.major),
        "python_minor_version": str(version_info.minor),
    }
    return labels
|
@@ -15,7 +15,6 @@ enum DataPayloadEncoding {
|
|
15
15
|
}
|
16
16
|
|
17
17
|
message DataPayload {
|
18
|
-
optional string path = 1; // deprecated, TODO: remove when URI is used everywhere
|
19
18
|
optional uint64 size = 2;
|
20
19
|
optional string sha256_hash = 3;
|
21
20
|
// URI of the data.
|
@@ -41,7 +40,6 @@ enum GPUModel {
|
|
41
40
|
message GPUResources {
|
42
41
|
optional uint32 count = 1;
|
43
42
|
optional GPUModel model = 2;
|
44
|
-
reserved 3;
|
45
43
|
}
|
46
44
|
|
47
45
|
// Resources that we're currently tracking and limiting on Executor.
|
@@ -64,17 +62,29 @@ message AllowedFunction {
|
|
64
62
|
|
65
63
|
enum FunctionExecutorStatus {
|
66
64
|
FUNCTION_EXECUTOR_STATUS_UNKNOWN = 0;
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
65
|
+
// Function Executor is being created.
|
66
|
+
FUNCTION_EXECUTOR_STATUS_PENDING = 1;
|
67
|
+
// Function Executor is running and ready to accept tasks.
|
68
|
+
FUNCTION_EXECUTOR_STATUS_RUNNING = 2;
|
69
|
+
// Function Executor is terminated, all resources are freed.
|
70
|
+
FUNCTION_EXECUTOR_STATUS_TERMINATED = 3;
|
71
|
+
}
|
72
|
+
|
73
|
+
enum FunctionExecutorTerminationReason {
|
74
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_UNKNOWN = 0;
|
75
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR = 1;
|
76
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR = 2;
|
77
|
+
// Timeout on FE startup while running the function constructor.
|
78
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT = 3;
|
79
|
+
|
80
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN = 10;
|
81
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE = 11;
|
82
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY = 12;
|
83
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR = 13;
|
84
|
+
// Timeout while running the function.
|
85
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT = 14;
|
86
|
+
// The running function allocation was removed from the desired state.
|
87
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED = 15;
|
78
88
|
}
|
79
89
|
|
80
90
|
// Immutable information that identifies and describes a Function Executor.
|
@@ -95,7 +105,6 @@ message FunctionExecutorDescription {
|
|
95
105
|
optional string function_name = 5;
|
96
106
|
optional string image_uri = 6;
|
97
107
|
repeated string secret_names = 7;
|
98
|
-
optional HostResources resource_limits = 8;
|
99
108
|
// Timeout for customer code duration during FE creation.
|
100
109
|
optional uint32 customer_code_timeout_ms = 9;
|
101
110
|
optional DataPayload graph = 10;
|
@@ -105,7 +114,7 @@ message FunctionExecutorDescription {
|
|
105
114
|
message FunctionExecutorState {
|
106
115
|
optional FunctionExecutorDescription description = 1;
|
107
116
|
optional FunctionExecutorStatus status = 2;
|
108
|
-
|
117
|
+
optional FunctionExecutorTerminationReason termination_reason = 3;
|
109
118
|
}
|
110
119
|
|
111
120
|
enum ExecutorStatus {
|
@@ -113,21 +122,12 @@ enum ExecutorStatus {
|
|
113
122
|
EXECUTOR_STATUS_STARTING_UP = 1;
|
114
123
|
EXECUTOR_STATUS_RUNNING = 2;
|
115
124
|
EXECUTOR_STATUS_DRAINED = 3;
|
116
|
-
|
117
|
-
EXECUTOR_STATUS_STOPPED = 5;
|
118
|
-
}
|
119
|
-
|
120
|
-
enum ExecutorFlavor {
|
121
|
-
EXECUTOR_FLAVOR_UNKNOWN = 0;
|
122
|
-
EXECUTOR_FLAVOR_OSS = 1;
|
123
|
-
EXECUTOR_FLAVOR_PLATFORM = 2;
|
125
|
+
EXECUTOR_STATUS_STOPPED = 4;
|
124
126
|
}
|
125
127
|
|
126
128
|
message ExecutorState {
|
127
129
|
optional string executor_id = 1;
|
128
|
-
optional bool development_mode = 2;
|
129
130
|
optional string hostname = 3;
|
130
|
-
optional ExecutorFlavor flavor = 4;
|
131
131
|
optional string version = 5;
|
132
132
|
optional ExecutorStatus status = 6;
|
133
133
|
// Total resources at the Executor.
|
@@ -148,6 +148,7 @@ message ExecutorState {
|
|
148
148
|
// A message sent by Executor to report its up to date state to Server.
|
149
149
|
message ReportExecutorStateRequest {
|
150
150
|
optional ExecutorState executor_state = 1;
|
151
|
+
repeated TaskResult task_results = 2;
|
151
152
|
}
|
152
153
|
|
153
154
|
// A message sent by Server to Executor to acknowledge the receipt of Executor state.
|
@@ -170,8 +171,6 @@ message Task {
|
|
170
171
|
optional string graph_version = 4;
|
171
172
|
optional string function_name = 5;
|
172
173
|
optional string graph_invocation_id = 6;
|
173
|
-
optional string input_key = 8; // deprecated. TODO: remove when input is used everywhere
|
174
|
-
optional string reducer_output_key = 9; // deprecated. TODO: remove when reducer_input is used everywhere
|
175
174
|
optional uint32 timeout_ms = 10;
|
176
175
|
optional DataPayload input = 11;
|
177
176
|
optional DataPayload reducer_input = 12;
|
@@ -185,6 +184,7 @@ message Task {
|
|
185
184
|
message TaskAllocation {
|
186
185
|
optional string function_executor_id = 1;
|
187
186
|
optional Task task = 2;
|
187
|
+
optional string allocation_id = 3;
|
188
188
|
}
|
189
189
|
|
190
190
|
// A message sent by Executor to Server to open the stream of desired Executor States for the Executor.
|
@@ -203,46 +203,57 @@ message DesiredExecutorState {
|
|
203
203
|
}
|
204
204
|
|
205
205
|
// ===== report_task_outcome RPC =====
|
206
|
-
enum
|
207
|
-
|
208
|
-
|
209
|
-
|
206
|
+
enum TaskOutcomeCode {
|
207
|
+
TASK_OUTCOME_CODE_UNKNOWN = 0;
|
208
|
+
TASK_OUTCOME_CODE_SUCCESS = 1;
|
209
|
+
TASK_OUTCOME_CODE_FAILURE = 2;
|
210
|
+
}
|
211
|
+
|
212
|
+
enum TaskFailureReason {
|
213
|
+
TASK_FAILURE_REASON_UNKNOWN = 0;
|
214
|
+
TASK_FAILURE_REASON_INTERNAL_ERROR = 1;
|
215
|
+
TASK_FAILURE_REASON_FUNCTION_ERROR = 2;
|
216
|
+
TASK_FAILURE_REASON_FUNCTION_TIMEOUT = 3;
|
217
|
+
TASK_FAILURE_REASON_TASK_CANCELLED = 4;
|
218
|
+
TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = 5;
|
210
219
|
}
|
211
220
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
OUTPUT_ENCODING_BINARY = 3;
|
221
|
+
message ResultRouting {
|
222
|
+
// The list of next functions.
|
223
|
+
// NB: An empty list indicates that no routing should be performed.
|
224
|
+
repeated string next_functions = 1;
|
217
225
|
}
|
218
226
|
|
219
|
-
message
|
227
|
+
message TaskResult {
|
220
228
|
optional string task_id = 1;
|
221
229
|
optional string namespace = 2;
|
222
230
|
optional string graph_name = 3;
|
223
231
|
optional string function_name = 4;
|
224
|
-
optional string graph_invocation_id =
|
225
|
-
optional
|
226
|
-
optional
|
227
|
-
optional
|
228
|
-
optional bool reducer = 10;
|
229
|
-
|
232
|
+
optional string graph_invocation_id = 5;
|
233
|
+
optional bool reducer = 6;
|
234
|
+
optional TaskOutcomeCode outcome_code = 7;
|
235
|
+
optional TaskFailureReason failure_reason = 8;
|
230
236
|
// Edges that the function wants the invocation to be routed to.
|
231
237
|
// Previously called router_edges.
|
232
|
-
|
233
|
-
//
|
234
|
-
repeated
|
238
|
+
// NB: An empty list indicates that the graph's route definitions should be used,
|
239
|
+
// unless this field is overridden by the presence of the `routing` field.
|
240
|
+
repeated string next_functions = 9;
|
241
|
+
repeated DataPayload function_outputs = 10;
|
235
242
|
// Standard output and error streams of the function.
|
236
|
-
optional DataPayload stdout =
|
237
|
-
optional DataPayload stderr =
|
238
|
-
|
239
|
-
optional
|
240
|
-
// This allows us to change how we encode the output from functions
|
241
|
-
// and serialize them into storage.
|
242
|
-
optional uint64 output_encoding_version = 5; // deprecated. TODO: remove when DataPayload.encoding_version is used everywhere
|
243
|
-
}
|
243
|
+
optional DataPayload stdout = 11;
|
244
|
+
optional DataPayload stderr = 12;
|
245
|
+
|
246
|
+
optional string allocation_id = 13;
|
244
247
|
|
245
|
-
|
248
|
+
// Indicates how the results should be routed.
|
249
|
+
// If this is present, it replaces `next_functions`.
|
250
|
+
//
|
251
|
+
// If absent, `next_functions` will be used; note that if no
|
252
|
+
// routes are defined in `next_functions`, this will use the
|
253
|
+
// graph's routing. The long-term goal is to deprecate
|
254
|
+
// `next_functions`, so that if `routing` is not present, the
|
255
|
+
// graph's routing definitions will always be used.
|
256
|
+
ResultRouting routing = 14;
|
246
257
|
}
|
247
258
|
|
248
259
|
// Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
|
@@ -262,7 +273,4 @@ service ExecutorAPI {
|
|
262
273
|
//
|
263
274
|
// Deprecated HTTP API is used to download the serialized graph and task inputs.
|
264
275
|
rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
|
265
|
-
|
266
|
-
// Report the outcome of a task.
|
267
|
-
rpc report_task_outcome(ReportTaskOutcomeRequest) returns (ReportTaskOutcomeResponse) {}
|
268
|
-
}
|
276
|
+
}
|