indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +158 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +69 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
  34. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  35. indexify/executor/metrics/executor.py +0 -47
  36. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  37. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  38. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  39. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  40. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  41. indexify/executor/state_reporter.py +364 -0
  42. indexify/proto/executor_api.proto +68 -60
  43. indexify/proto/executor_api_pb2.py +52 -52
  44. indexify/proto/executor_api_pb2.pyi +129 -108
  45. indexify/proto/executor_api_pb2_grpc.py +0 -47
  46. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
  47. indexify-0.4.3.dist-info/RECORD +68 -0
  48. indexify-0.4.3.dist-info/entry_points.txt +3 -0
  49. indexify/cli/cli.py +0 -268
  50. indexify/executor/api_objects.py +0 -92
  51. indexify/executor/downloader.py +0 -417
  52. indexify/executor/executor_flavor.py +0 -7
  53. indexify/executor/function_executor/function_executor_state.py +0 -107
  54. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  55. indexify/executor/function_executor/function_executor_status.py +0 -95
  56. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  57. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  58. indexify/executor/function_executor/single_task_runner.py +0 -345
  59. indexify/executor/function_executor/task_input.py +0 -21
  60. indexify/executor/function_executor/task_output.py +0 -105
  61. indexify/executor/grpc/function_executor_controller.py +0 -418
  62. indexify/executor/grpc/metrics/task_controller.py +0 -8
  63. indexify/executor/grpc/state_reporter.py +0 -317
  64. indexify/executor/grpc/task_controller.py +0 -508
  65. indexify/executor/metrics/task_fetcher.py +0 -21
  66. indexify/executor/metrics/task_reporter.py +0 -53
  67. indexify/executor/metrics/task_runner.py +0 -52
  68. indexify/executor/monitoring/function_allowlist.py +0 -25
  69. indexify/executor/runtime_probes.py +0 -68
  70. indexify/executor/task_fetcher.py +0 -96
  71. indexify/executor/task_reporter.py +0 -459
  72. indexify/executor/task_runner.py +0 -177
  73. indexify-0.3.31.dist-info/RECORD +0 -68
  74. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  75. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
--- /dev/null
+++ b/indexify/executor/state_reporter.py
@@ -0,0 +1,364 @@
+import asyncio
+import hashlib
+import platform
+import sys
+from socket import gethostname
+from typing import Any, Dict, List, Optional
+
+from indexify.proto.executor_api_pb2 import (
+    AllowedFunction,
+    ExecutorState,
+    ExecutorStatus,
+    FunctionExecutorState,
+    GPUModel,
+    GPUResources,
+)
+from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
+from indexify.proto.executor_api_pb2 import (
+    ReportExecutorStateRequest,
+    TaskFailureReason,
+    TaskOutcomeCode,
+    TaskResult,
+)
+from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
+
+from .channel_manager import ChannelManager
+from .function_allowlist import FunctionURI
+from .function_executor_controller.loggers import task_logger
+from .function_executor_controller.task_output import TaskOutput
+from .host_resources.host_resources import HostResources, HostResourcesProvider
+from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
+from .metrics.state_reporter import (
+    metric_state_report_errors,
+    metric_state_report_latency,
+    metric_state_report_rpcs,
+)
+
+_REPORTING_INTERVAL_SEC = 5
+_REPORT_RPC_TIMEOUT_SEC = 5
+
+
+class ExecutorStateReporter:
+    def __init__(
+        self,
+        executor_id: str,
+        version: str,
+        labels: Dict[str, str],
+        function_allowlist: List[FunctionURI],
+        channel_manager: ChannelManager,
+        host_resources_provider: HostResourcesProvider,
+        logger: Any,
+        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
+    ):
+        self._executor_id: str = executor_id
+        self._version: str = version
+        self._labels: Dict[str, str] = labels.copy()
+        self._labels.update(_executor_labels())
+        self._hostname: str = gethostname()
+        self._channel_manager = channel_manager
+        self._logger: Any = logger.bind(module=__name__)
+        self._reporting_interval_sec: int = reporting_interval_sec
+        self._allowed_functions: List[AllowedFunction] = _to_allowed_function_protos(
+            function_allowlist
+        )
+        # We need to fetch total resources only once, because they are not changing.
+        self._total_host_resources: HostResources = (
+            host_resources_provider.total_host_resources(self._logger)
+        )
+        self._total_function_executor_resources: HostResources = (
+            host_resources_provider.total_function_executor_resources(self._logger)
+        )
+        self._logger.info(
+            "detected host resources",
+            total_host_resources=self._total_host_resources,
+            total_function_executor_resources=self._total_function_executor_resources,
+        )
+        self._state_report_worker: Optional[asyncio.Task] = None
+        self._periodic_state_report_scheduler: Optional[asyncio.Task] = None
+
+        # Mutable fields
+        self._state_report_scheduled_event: asyncio.Event = asyncio.Event()
+        self._state_reported_event: asyncio.Event = asyncio.Event()
+        self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
+        self._last_server_clock: int = (
+            0  # Server expects initial value to be 0 until it is set by Server.
+        )
+        self._completed_task_outputs: List[TaskOutput] = []
+        self._function_executor_states: Dict[str, FunctionExecutorState] = {}
+
+    def update_executor_status(self, value: ExecutorStatus) -> None:
+        self._executor_status = value
+
+    def update_last_server_clock(self, value: int) -> None:
+        self._last_server_clock = value
+
+    def update_function_executor_state(
+        self,
+        state: FunctionExecutorState,
+    ) -> None:
+        self._function_executor_states[state.description.id] = state
+
+    def remove_function_executor_info(self, function_executor_id: str) -> None:
+        if function_executor_id not in self._function_executor_states:
+            self._logger.warning(
+                "attempted to remove non-existing function executor state",
+                function_executor_id=function_executor_id,
+            )
+            return
+
+        self._function_executor_states.pop(function_executor_id)
+
+    def add_completed_task_output(self, task_output: TaskOutput) -> None:
+        self._completed_task_outputs.append(task_output)
+
+    def schedule_state_report(self) -> None:
+        """Schedules a state report to be sent to the server asap.
+
+        This method is called when the executor state changes and it needs to get reported.
+        The call doesn't block and returns immediately.
+        """
+        self._state_report_scheduled_event.set()
+
+    async def report_state_and_wait_for_completion(self) -> None:
+        """Schedules a state report to be sent to the server asap and waits for the completion of the reporting."""
+        self._state_reported_event.clear()
+        self.schedule_state_report()
+        await self._state_reported_event.wait()
+
+    def run(self) -> None:
+        """Runs the state reporter.
+
+        This method is called when the executor starts and it needs to start reporting its state
+        periodically.
+        """
+        self._state_report_worker = asyncio.create_task(
+            self._state_report_worker_loop(), name="state_reporter_worker"
+        )
+        self._periodic_state_report_scheduler = asyncio.create_task(
+            self._periodic_state_report_scheduler_loop(),
+            name="state_reporter_periodic_scheduler",
+        )
+
+    async def shutdown(self) -> None:
+        """Tries to do one last state report and shuts down the state reporter.
+
+        Doesn't raise any exceptions."""
+        if self._state_report_worker is not None:
+            self._state_report_worker.cancel()
+            try:
+                await self._state_report_worker
+            except asyncio.CancelledError:
+                pass  # Expected exception
+            self._state_report_worker = None
+
+        if self._periodic_state_report_scheduler is not None:
+            self._periodic_state_report_scheduler.cancel()
+            try:
+                await self._periodic_state_report_scheduler
+            except asyncio.CancelledError:
+                pass
+            self._periodic_state_report_scheduler = None
+
+        # Don't retry state report if it failed during shutdown.
+        # We only do best effort last state report and Server might not be available.
+        try:
+            async with self._channel_manager.create_channel() as channel:
+                stub = ExecutorAPIStub(channel)
+                await self._report_state(stub)
+        except BaseException as e:
+            self._logger.error(
+                "failed to report state during shutdown",
+                exc_info=e,
+            )
+
+    async def _periodic_state_report_scheduler_loop(self) -> None:
+        while True:
+            self._state_report_scheduled_event.set()
+            await asyncio.sleep(self._reporting_interval_sec)
+
+    async def _state_report_worker_loop(self) -> None:
+        """Runs the state reporter.
+
+        Never raises any exceptions.
+        """
+        while True:
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while True:
+                await self._state_report_scheduled_event.wait()
+                # Clear the event immediately to report again asap if needed. This reduces latency in the system.
+                self._state_report_scheduled_event.clear()
+                try:
+                    # The periodic state reports serve as channel health monitoring requests
+                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
+                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
+                    # from this channel health monitoring.
+                    await self._report_state(stub)
+                    self._state_reported_event.set()
+                except Exception as e:
+                    self._logger.error(
+                        f"failed to report state to the server, retrying in {self._reporting_interval_sec} sec.",
+                        exc_info=e,
+                    )
+                    break  # exit the inner loop to recreate the channel if needed
+
+    async def _report_state(self, stub: ExecutorAPIStub):
+        """Reports the current state to the server represented by the supplied stub.
+
+        Raises an exception on failure.
+        """
+        with (
+            metric_state_report_errors.count_exceptions(),
+            metric_state_report_latency.time(),
+        ):
+            metric_state_report_rpcs.inc()
+            state: ExecutorState = self._current_executor_state()
+            task_outputs: List[TaskOutput] = self._remove_completed_tasks()
+            task_results: List[TaskResult] = _to_task_result_protos(task_outputs)
+
+            try:
+                await stub.report_executor_state(
+                    ReportExecutorStateRequest(
+                        executor_state=state, task_results=task_results
+                    ),
+                    timeout=_REPORT_RPC_TIMEOUT_SEC,
+                )
+            except Exception as e:
+                for task_output in task_outputs:
+                    self.add_completed_task_output(task_output)
+                raise
+
+    def _current_executor_state(self) -> ExecutorState:
+        """Returns the current executor state."""
+        state = ExecutorState(
+            executor_id=self._executor_id,
+            hostname=self._hostname,
+            version=self._version,
+            status=self._executor_status,
+            total_function_executor_resources=_to_host_resources_proto(
+                self._total_function_executor_resources
+            ),
+            total_resources=_to_host_resources_proto(self._total_host_resources),
+            allowed_functions=self._allowed_functions,
+            function_executor_states=list(self._function_executor_states.values()),
+            labels=self._labels,
+        )
+        state.state_hash = _state_hash(state)
+        # Set fields not included in the state hash.
+        state.server_clock = self._last_server_clock
+        return state
+
+    def _remove_completed_tasks(self) -> List[TaskOutput]:
+        task_outputs: List[TaskOutput] = []
+        while len(self._completed_task_outputs) > 0:
+            task_output = self._completed_task_outputs.pop()
+            task_outputs.append(task_output)
+            task_logger(task_output.task, self._logger).info(
+                "reporting task outcome",
+                outcome_code=TaskOutcomeCode.Name(task_output.outcome_code),
+                failure_reason=(
+                    "None"
+                    if task_output.failure_reason is None
+                    else TaskFailureReason.Name(task_output.failure_reason)
+                ),
+            )
+        return task_outputs
+
+
+def _to_allowed_function_protos(
+    function_allowlist: List[FunctionURI],
+) -> List[AllowedFunction]:
+    allowed_functions: List[AllowedFunction] = []
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        allowed_function = AllowedFunction(
+            namespace=function_uri.namespace,
+            graph_name=function_uri.compute_graph,
+            function_name=function_uri.compute_fn,
+        )
+        if function_uri.version is not None:
+            allowed_function.graph_version = function_uri.version
+        allowed_functions.append(allowed_function)
+
+    return allowed_functions
+
+
+def _state_hash(state: ExecutorState) -> str:
+    serialized_state: bytes = state.SerializeToString(deterministic=True)
+    hasher = hashlib.sha256(usedforsecurity=False)
+    hasher.update(serialized_state)
+    return hasher.hexdigest()
+
+
+def _to_host_resources_proto(host_resources: HostResources) -> HostResourcesProto:
+    proto = HostResourcesProto(
+        cpu_count=host_resources.cpu_count,
+        memory_bytes=host_resources.memory_mb * 1024 * 1024,
+        disk_bytes=host_resources.disk_mb * 1024 * 1024,
+    )
+    if len(host_resources.gpus) > 0:
+        proto.gpu.CopyFrom(
+            GPUResources(
+                count=len(host_resources.gpus),
+                model=_to_gpu_model_proto(
+                    host_resources.gpus[0].model
+                ),  # All GPUs have the same model
+            )
+        )
+    return proto
+
+
+def _to_gpu_model_proto(nvidia_gpu_model: NVIDIA_GPU_MODEL) -> GPUModel:
+    if nvidia_gpu_model == NVIDIA_GPU_MODEL.A100_40GB:
+        return GPUModel.GPU_MODEL_NVIDIA_A100_40GB
+    elif nvidia_gpu_model == NVIDIA_GPU_MODEL.A100_80GB:
+        return GPUModel.GPU_MODEL_NVIDIA_A100_80GB
+    elif nvidia_gpu_model == NVIDIA_GPU_MODEL.H100_80GB:
+        return GPUModel.GPU_MODEL_NVIDIA_H100_80GB
+    elif nvidia_gpu_model == NVIDIA_GPU_MODEL.TESLA_T4:
+        return GPUModel.GPU_MODEL_NVIDIA_TESLA_T4
+    elif nvidia_gpu_model == NVIDIA_GPU_MODEL.A6000:
+        return GPUModel.GPU_MODEL_NVIDIA_A6000
+    elif nvidia_gpu_model == NVIDIA_GPU_MODEL.A10:
+        return GPUModel.GPU_MODEL_NVIDIA_A10
+    else:
+        return GPUModel.GPU_MODEL_UNKNOWN
+
+
+def _to_task_result_protos(task_outputs: List[TaskOutput]) -> List[TaskResult]:
+    task_results: List[TaskResult] = []
+
+    for output in task_outputs:
+        task_result = TaskResult(
+            task_id=output.task.id,
+            allocation_id=output.allocation_id,
+            namespace=output.task.namespace,
+            graph_name=output.task.graph_name,
+            function_name=output.task.function_name,
+            graph_invocation_id=output.task.graph_invocation_id,
+            reducer=output.reducer,
+            outcome_code=output.outcome_code,
+            next_functions=(output.router_output.edges if output.router_output else []),
+            function_outputs=output.uploaded_data_payloads,
+        )
+        if output.failure_reason is not None:
+            task_result.failure_reason = output.failure_reason
+        if output.uploaded_stdout is not None:
+            task_result.stdout.CopyFrom(output.uploaded_stdout)
+        if output.uploaded_stderr is not None:
+            task_result.stderr.CopyFrom(output.uploaded_stderr)
+        if output.router_output is not None:
+            task_result.routing.next_functions[:] = output.router_output.edges
+
+        task_results.append(task_result)
+
+    return task_results
+
+
+def _executor_labels() -> Dict[str, str]:
+    """Returns standard executor labels always added to user supplied labels."""
+    return {
+        "os": platform.system(),
+        "architecture": platform.machine(),
+        "python_major_version": str(sys.version_info.major),
+        "python_minor_version": str(sys.version_info.minor),
+    }
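
The new `ExecutorStateReporter` module above appears to replace the deleted `task_reporter.py` and `grpc/state_reporter.py`: completed task outputs now piggyback on the periodic executor state report instead of a dedicated `report_task_outcome` RPC. A minimal usage sketch, assuming the `channel_manager`, `host_resources_provider`, and `logger` dependencies are constructed elsewhere by the executor (their constructors are not part of this diff) and with made-up ID and label values:

```python
import asyncio

from indexify.executor.state_reporter import ExecutorStateReporter
from indexify.proto.executor_api_pb2 import ExecutorStatus


async def run_reporter(channel_manager, host_resources_provider, logger) -> None:
    # Hypothetical wiring; the real executor builds these dependencies itself.
    reporter = ExecutorStateReporter(
        executor_id="executor-1",
        version="0.4.3",
        labels={"pool": "gpu"},  # merged with _executor_labels() (os, arch, python version)
        function_allowlist=[],
        channel_manager=channel_manager,
        host_resources_provider=host_resources_provider,
        logger=logger,
    )
    reporter.update_executor_status(ExecutorStatus.EXECUTOR_STATUS_RUNNING)
    reporter.run()  # starts the worker loop and the 5s periodic scheduler

    # On a local state change (e.g. a task completed), report immediately
    # instead of waiting for the next periodic tick:
    reporter.schedule_state_report()

    await asyncio.sleep(60)  # ... executor does its work ...
    await reporter.shutdown()  # cancels the loops and sends a best-effort final report
```
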
--- a/indexify/proto/executor_api.proto
+++ b/indexify/proto/executor_api.proto
@@ -15,7 +15,6 @@ enum DataPayloadEncoding {
 }
 
 message DataPayload {
-  optional string path = 1; // deprecated, TODO: remove when URI us used everywhere
   optional uint64 size = 2;
   optional string sha256_hash = 3;
   // URI of the data.
@@ -41,7 +40,6 @@ enum GPUModel {
 message GPUResources {
   optional uint32 count = 1;
   optional GPUModel model = 2;
-  reserved 3;
 }
 
 // Resources that we're currently tracking and limiting on Executor.
@@ -64,17 +62,29 @@ message AllowedFunction {
 
 enum FunctionExecutorStatus {
   FUNCTION_EXECUTOR_STATUS_UNKNOWN = 0;
-  FUNCTION_EXECUTOR_STATUS_STARTING_UP = 1;
-  FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR = 2;
-  FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR = 3;
-  FUNCTION_EXECUTOR_STATUS_IDLE = 4;
-  FUNCTION_EXECUTOR_STATUS_RUNNING_TASK = 5;
-  FUNCTION_EXECUTOR_STATUS_UNHEALTHY = 6;
-  FUNCTION_EXECUTOR_STATUS_STOPPING = 7;
-  // FE is stopped but can be started up.
-  FUNCTION_EXECUTOR_STATUS_STOPPED = 8;
-  // FE is stopped forever, all resources are freed.
-  FUNCTION_EXECUTOR_STATUS_SHUTDOWN = 9;
+  // Function Executor is being created.
+  FUNCTION_EXECUTOR_STATUS_PENDING = 1;
+  // Function Executor is running and ready to accept tasks.
+  FUNCTION_EXECUTOR_STATUS_RUNNING = 2;
+  // Function Executor is terminated, all resources are freed.
+  FUNCTION_EXECUTOR_STATUS_TERMINATED = 3;
+}
+
+enum FunctionExecutorTerminationReason {
+  FUNCTION_EXECUTOR_TERMINATION_REASON_UNKNOWN = 0;
+  FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR = 1;
+  FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR = 2;
+  // Timeout on FE startup while running the function constructor.
+  FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT = 3;
+
+  FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN = 10;
+  FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE = 11;
+  FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY = 12;
+  FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR = 13;
+  // Timeout while running the function.
+  FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT = 14;
+  // The running function allocation was removed from the desired state.
+  FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED = 15;
 }
 
 // Immutable information that identifies and describes a Function Executor.
@@ -84,7 +94,7 @@ message FunctionExecutorResources {
   optional uint32 cpu_ms_per_sec = 1;
   optional uint64 memory_bytes = 2;
   optional uint64 disk_bytes = 3;
-  optional uint32 gpu_count = 4;
+  optional GPUResources gpu = 4;
 }
 
 message FunctionExecutorDescription {
@@ -95,7 +105,6 @@ message FunctionExecutorDescription {
   optional string function_name = 5;
   optional string image_uri = 6;
   repeated string secret_names = 7;
-  optional HostResources resource_limits = 8;
   // Timeout for customer code duration during FE creation.
   optional uint32 customer_code_timeout_ms = 9;
   optional DataPayload graph = 10;
@@ -105,7 +114,7 @@ message FunctionExecutorDescription {
 message FunctionExecutorState {
   optional FunctionExecutorDescription description = 1;
   optional FunctionExecutorStatus status = 2;
-  reserved 3;
+  optional FunctionExecutorTerminationReason termination_reason = 3;
 }
 
 enum ExecutorStatus {
@@ -113,21 +122,12 @@ enum ExecutorStatus {
   EXECUTOR_STATUS_STARTING_UP = 1;
   EXECUTOR_STATUS_RUNNING = 2;
   EXECUTOR_STATUS_DRAINED = 3;
-  EXECUTOR_STATUS_STOPPING = 4;
-  EXECUTOR_STATUS_STOPPED = 5;
-}
-
-enum ExecutorFlavor {
-  EXECUTOR_FLAVOR_UNKNOWN = 0;
-  EXECUTOR_FLAVOR_OSS = 1;
-  EXECUTOR_FLAVOR_PLATFORM = 2;
+  EXECUTOR_STATUS_STOPPED = 4;
 }
 
 message ExecutorState {
   optional string executor_id = 1;
-  optional bool development_mode = 2;
   optional string hostname = 3;
-  optional ExecutorFlavor flavor = 4;
   optional string version = 5;
   optional ExecutorStatus status = 6;
   // Total resources at the Executor.
@@ -148,6 +148,7 @@ message ExecutorState {
 // A message sent by Executor to report its up to date state to Server.
 message ReportExecutorStateRequest {
   optional ExecutorState executor_state = 1;
+  repeated TaskResult task_results = 2;
 }
 
 // A message sent by Server to Executor to acknowledge the receipt of Executor state.
@@ -170,8 +171,6 @@ message Task {
   optional string graph_version = 4;
   optional string function_name = 5;
   optional string graph_invocation_id = 6;
-  optional string input_key = 8; // deprecated. TODO: remove when input is used everywhere
-  optional string reducer_output_key = 9; // deprecated. TODO: remove when reducer_input is used everywhere
   optional uint32 timeout_ms = 10;
   optional DataPayload input = 11;
   optional DataPayload reducer_input = 12;
@@ -185,6 +184,7 @@ message Task {
 message TaskAllocation {
   optional string function_executor_id = 1;
   optional Task task = 2;
+  optional string allocation_id = 3;
 }
 
 // A message sent by Executor to Server to open the stream of desired Executor States for the Executor.
@@ -203,46 +203,57 @@ message DesiredExecutorState {
 }
 
 // ===== report_task_outcome RPC =====
-enum TaskOutcome {
-  TASK_OUTCOME_UNKNOWN = 0;
-  TASK_OUTCOME_SUCCESS = 1;
-  TASK_OUTCOME_FAILURE = 2;
+enum TaskOutcomeCode {
+  TASK_OUTCOME_CODE_UNKNOWN = 0;
+  TASK_OUTCOME_CODE_SUCCESS = 1;
+  TASK_OUTCOME_CODE_FAILURE = 2;
+}
+
+enum TaskFailureReason {
+  TASK_FAILURE_REASON_UNKNOWN = 0;
+  TASK_FAILURE_REASON_INTERNAL_ERROR = 1;
+  TASK_FAILURE_REASON_FUNCTION_ERROR = 2;
+  TASK_FAILURE_REASON_FUNCTION_TIMEOUT = 3;
+  TASK_FAILURE_REASON_TASK_CANCELLED = 4;
+  TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = 5;
 }
 
-enum OutputEncoding {
-  OUTPUT_ENCODING_UNKNOWN = 0;
-  OUTPUT_ENCODING_JSON = 1;
-  OUTPUT_ENCODING_PICKLE = 2;
-  OUTPUT_ENCODING_BINARY = 3;
+message ResultRouting {
+  // The list of next functions.
+  // NB: An empty list indicates that no routing should be performed.
+  repeated string next_functions = 1;
 }
 
-message ReportTaskOutcomeRequest {
+message TaskResult {
   optional string task_id = 1;
   optional string namespace = 2;
   optional string graph_name = 3;
   optional string function_name = 4;
-  optional string graph_invocation_id = 6;
-  optional TaskOutcome outcome = 7;
-  optional string invocation_id = 8; // deprecated. TODO: remove when graph_invocation_id is used everywhere
-  optional string executor_id = 9;
-  optional bool reducer = 10;
-
+  optional string graph_invocation_id = 5;
+  optional bool reducer = 6;
+  optional TaskOutcomeCode outcome_code = 7;
+  optional TaskFailureReason failure_reason = 8;
   // Edges that the function wants the invocation to be routed to.
   // Previously called router_edges.
-  repeated string next_functions = 11;
-  // Outputs of the function.
-  repeated DataPayload fn_outputs = 12;
+  // NB: An empty list indicates that the graph's route definitions should be used,
+  // unless this field is overridden by the presence of the `routing` field.
+  repeated string next_functions = 9;
+  repeated DataPayload function_outputs = 10;
   // Standard output and error streams of the function.
-  optional DataPayload stdout = 14;
-  optional DataPayload stderr = 15;
-  // Output encoding of all the outputs of a function have to be same.
-  optional OutputEncoding output_encoding = 13; // deprecated. TODO: remove when DataPayload.encoding is used everywhere
-  // This allows us to change how we encode the output from functions
-  // and serialize them into storage.
-  optional uint64 output_encoding_version = 5; // deprecated. TODO: remove when DataPayload.encoding_version is used everywhere
-}
+  optional DataPayload stdout = 11;
+  optional DataPayload stderr = 12;
+
+  optional string allocation_id = 13;
 
-message ReportTaskOutcomeResponse {
+  // Indicates how the results should be routed.
+  // If this is present, it replaces `next_functions`.
+  //
+  // If absent, `next_functions` will be used; note that if no
+  // routes are defined in `next_functions`, this will use the
+  // graph's routing. The long-term goal is to deprecate
+  // `next_functions`, so that if `routing` is not present, the
+  // graph's routing definitions will always be used.
+  ResultRouting routing = 14;
 }
 
 // Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
@@ -262,7 +273,4 @@ service ExecutorAPI {
   //
   // Deprecated HTTP API is used to download the serialized graph and task inputs.
   rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
-
-  // Report the outcome of a task.
-  rpc report_task_outcome(ReportTaskOutcomeRequest) returns (ReportTaskOutcomeResponse) {}
-}
+}
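
Together with the `state_reporter.py` module above, these proto changes move task result reporting into the `task_results` field of `ReportExecutorStateRequest`, replacing the removed `report_task_outcome` RPC. A minimal sketch of building the new messages from the generated `executor_api_pb2` bindings; all field values below are invented for illustration:

```python
from indexify.proto.executor_api_pb2 import (
    DataPayload,
    ExecutorState,
    ReportExecutorStateRequest,
    ResultRouting,
    TaskOutcomeCode,
    TaskResult,
)

task_result = TaskResult(
    task_id="task-123",
    namespace="default",
    graph_name="my_graph",
    function_name="my_fn",
    graph_invocation_id="inv-456",
    allocation_id="alloc-789",
    outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS,
    function_outputs=[DataPayload(size=1024, sha256_hash="...")],
    # When `routing` is present it replaces `next_functions`; an empty
    # ResultRouting means "route nowhere", while leaving both unset falls back
    # to the graph's own route definitions (see the field comments above).
    routing=ResultRouting(next_functions=["next_fn"]),
)

request = ReportExecutorStateRequest(
    executor_state=ExecutorState(executor_id="executor-1"),
    task_results=[task_result],
)
```
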