indexify 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. indexify/executor/function_executor/function_executor.py +30 -25
  2. indexify/executor/function_executor_controller/__init__.py +7 -4
  3. indexify/executor/function_executor_controller/create_function_executor.py +125 -27
  4. indexify/executor/function_executor_controller/destroy_function_executor.py +1 -1
  5. indexify/executor/function_executor_controller/events.py +10 -14
  6. indexify/executor/function_executor_controller/function_executor_controller.py +108 -66
  7. indexify/executor/function_executor_controller/function_executor_startup_output.py +21 -0
  8. indexify/executor/function_executor_controller/loggers.py +55 -7
  9. indexify/executor/function_executor_controller/message_validators.py +16 -1
  10. indexify/executor/function_executor_controller/prepare_task.py +3 -3
  11. indexify/executor/function_executor_controller/run_task.py +19 -27
  12. indexify/executor/function_executor_controller/task_info.py +2 -3
  13. indexify/executor/function_executor_controller/task_output.py +12 -24
  14. indexify/executor/function_executor_controller/upload_task_output.py +7 -7
  15. indexify/executor/state_reconciler.py +5 -33
  16. indexify/executor/state_reporter.py +46 -56
  17. indexify/proto/executor_api.proto +34 -17
  18. indexify/proto/executor_api_pb2.py +46 -42
  19. indexify/proto/executor_api_pb2.pyi +50 -8
  20. {indexify-0.4.6.dist-info → indexify-0.4.8.dist-info}/METADATA +2 -2
  21. {indexify-0.4.6.dist-info → indexify-0.4.8.dist-info}/RECORD +23 -22
  22. {indexify-0.4.6.dist-info → indexify-0.4.8.dist-info}/WHEEL +0 -0
  23. {indexify-0.4.6.dist-info → indexify-0.4.8.dist-info}/entry_points.txt +0 -0
@@ -16,7 +16,7 @@ from indexify.executor.function_executor.function_executor import FunctionExecut
16
16
  from indexify.executor.function_executor.health_checker import HealthCheckResult
17
17
  from indexify.proto.executor_api_pb2 import (
18
18
  FunctionExecutorTerminationReason,
19
- Task,
19
+ TaskAllocation,
20
20
  TaskFailureReason,
21
21
  TaskOutcomeCode,
22
22
  )
@@ -41,12 +41,13 @@ async def run_task_on_function_executor(
41
41
  """
42
42
  logger = logger.bind(module=__name__)
43
43
  request: RunTaskRequest = RunTaskRequest(
44
- namespace=task_info.task.namespace,
45
- graph_name=task_info.task.graph_name,
46
- graph_version=task_info.task.graph_version,
47
- function_name=task_info.task.function_name,
48
- graph_invocation_id=task_info.task.graph_invocation_id,
49
- task_id=task_info.task.id,
44
+ namespace=task_info.allocation.task.namespace,
45
+ graph_name=task_info.allocation.task.graph_name,
46
+ graph_version=task_info.allocation.task.graph_version,
47
+ function_name=task_info.allocation.task.function_name,
48
+ graph_invocation_id=task_info.allocation.task.graph_invocation_id,
49
+ task_id=task_info.allocation.task.id,
50
+ allocation_id=task_info.allocation.allocation_id,
50
51
  function_input=task_info.input,
51
52
  )
52
53
  # Don't keep the input in memory after we started running the task.
@@ -58,8 +59,8 @@ async def run_task_on_function_executor(
58
59
  task_info.init_value = None
59
60
 
60
61
  function_executor.invocation_state_client().add_task_to_invocation_id_entry(
61
- task_id=task_info.task.id,
62
- invocation_id=task_info.task.graph_invocation_id,
62
+ task_id=task_info.allocation.task.id,
63
+ invocation_id=task_info.allocation.task.graph_invocation_id,
63
64
  )
64
65
 
65
66
  metric_function_executor_run_task_rpcs.inc()
@@ -73,16 +74,15 @@ async def run_task_on_function_executor(
73
74
  # If this RPC failed due to customer code crashing the server we won't be
74
75
  # able to detect this. We'll treat this as our own error for now and thus
75
76
  # let the AioRpcError be raised here.
76
- timeout_sec = task_info.task.timeout_ms / 1000.0
77
+ timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
77
78
  try:
78
79
  channel: grpc.aio.Channel = function_executor.channel()
79
80
  response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
80
81
  request, timeout=timeout_sec
81
82
  )
82
83
  task_info.output = _task_output_from_function_executor_response(
83
- task=task_info.task,
84
+ allocation=task_info.allocation,
84
85
  response=response,
85
- allocation_id=task_info.allocation_id,
86
86
  )
87
87
  except grpc.aio.AioRpcError as e:
88
88
  if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
@@ -91,36 +91,29 @@ async def run_task_on_function_executor(
91
91
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
92
92
  )
93
93
  task_info.output = TaskOutput.function_timeout(
94
- task=task_info.task,
95
- allocation_id=task_info.allocation_id,
94
+ allocation=task_info.allocation,
96
95
  timeout_sec=timeout_sec,
97
96
  )
98
97
  else:
99
98
  metric_function_executor_run_task_rpc_errors.inc()
100
99
  logger.error("task execution failed", exc_info=e)
101
- task_info.output = TaskOutput.internal_error(
102
- task=task_info.task, allocation_id=task_info.allocation_id
103
- )
100
+ task_info.output = TaskOutput.internal_error(task_info.allocation)
104
101
  except asyncio.CancelledError:
105
102
  # The task is still running in FE, we only cancelled the client-side RPC.
106
103
  function_executor_termination_reason = (
107
104
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
108
105
  )
109
- task_info.output = TaskOutput.task_cancelled(
110
- task=task_info.task, allocation_id=task_info.allocation_id
111
- )
106
+ task_info.output = TaskOutput.task_cancelled(task_info.allocation)
112
107
  except Exception as e:
113
108
  metric_function_executor_run_task_rpc_errors.inc()
114
109
  logger.error("task execution failed", exc_info=e)
115
- task_info.output = TaskOutput.internal_error(
116
- task=task_info.task, allocation_id=task_info.allocation_id
117
- )
110
+ task_info.output = TaskOutput.internal_error(task_info.allocation)
118
111
 
119
112
  metric_function_executor_run_task_rpc_latency.observe(time.monotonic() - start_time)
120
113
  metric_function_executor_run_task_rpcs_in_progress.dec()
121
114
 
122
115
  function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
123
- task_id=task_info.task.id,
116
+ task_id=task_info.allocation.task.id,
124
117
  )
125
118
 
126
119
  if (
@@ -147,7 +140,7 @@ async def run_task_on_function_executor(
147
140
 
148
141
 
149
142
  def _task_output_from_function_executor_response(
150
- task: Task, response: RunTaskResponse, allocation_id: str
143
+ allocation: TaskAllocation, response: RunTaskResponse
151
144
  ) -> TaskOutput:
152
145
  response_validator = MessageValidator(response)
153
146
  response_validator.required_field("stdout")
@@ -162,8 +155,7 @@ def _task_output_from_function_executor_response(
162
155
  metrics.timers = dict(response.metrics.timers)
163
156
 
164
157
  output = TaskOutput(
165
- task=task,
166
- allocation_id=allocation_id,
158
+ allocation=allocation,
167
159
  outcome_code=(
168
160
  TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
169
161
  if response.success
@@ -4,7 +4,7 @@ from typing import Optional
4
4
 
5
5
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
6
6
 
7
- from indexify.proto.executor_api_pb2 import Task
7
+ from indexify.proto.executor_api_pb2 import TaskAllocation
8
8
 
9
9
  from .task_output import TaskOutput
10
10
 
@@ -13,8 +13,7 @@ from .task_output import TaskOutput
13
13
  class TaskInfo:
14
14
  """Object used to track a task during its full lifecycle in the FunctionExecutorController."""
15
15
 
16
- task: Task
17
- allocation_id: str
16
+ allocation: TaskAllocation
18
17
  # time.monotonic() timestamp
19
18
  start_time: float
20
19
  # time.monotonic() timestamp when the task was prepared for execution
@@ -7,7 +7,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
 
8
8
  from indexify.proto.executor_api_pb2 import (
9
9
  DataPayload,
10
- Task,
10
+ TaskAllocation,
11
11
  TaskFailureReason,
12
12
  TaskOutcomeCode,
13
13
  )
@@ -26,8 +26,7 @@ class TaskOutput:
26
26
 
27
27
  def __init__(
28
28
  self,
29
- task: Task,
30
- allocation_id: str,
29
+ allocation: TaskAllocation,
31
30
  outcome_code: TaskOutcomeCode,
32
31
  # Optional[TaskFailureReason] is not supported in python 3.9
33
32
  failure_reason: TaskFailureReason = None,
@@ -42,8 +41,8 @@ class TaskOutput:
42
41
  uploaded_stdout: Optional[DataPayload] = None,
43
42
  uploaded_stderr: Optional[DataPayload] = None,
44
43
  ):
45
- self.task = task
46
- self.allocation_id = allocation_id
44
+ self.task = allocation.task
45
+ self.allocation = allocation
47
46
  self.function_output = function_output
48
47
  self.router_output = router_output
49
48
  self.stdout = stdout
@@ -60,14 +59,12 @@ class TaskOutput:
60
59
  @classmethod
61
60
  def internal_error(
62
61
  cls,
63
- task: Task,
64
- allocation_id: str,
62
+ allocation: TaskAllocation,
65
63
  ) -> "TaskOutput":
66
64
  """Creates a TaskOutput for an internal error."""
67
65
  # We are not sharing internal error messages with the customer.
68
66
  return TaskOutput(
69
- task=task,
70
- allocation_id=allocation_id,
67
+ allocation=allocation,
71
68
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
72
69
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
73
70
  stderr="Platform failed to execute the function.",
@@ -76,15 +73,13 @@ class TaskOutput:
76
73
  @classmethod
77
74
  def function_timeout(
78
75
  cls,
79
- task: Task,
80
- allocation_id: str,
76
+ allocation: TaskAllocation,
81
77
  timeout_sec: float,
82
78
  ) -> "TaskOutput":
83
79
  """Creates a TaskOutput for a function timeout error."""
84
80
  # Task stdout, stderr is not available.
85
81
  return TaskOutput(
86
- task=task,
87
- allocation_id=allocation_id,
82
+ allocation=allocation,
88
83
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
89
84
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
90
85
  stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
@@ -93,13 +88,11 @@ class TaskOutput:
93
88
  @classmethod
94
89
  def task_cancelled(
95
90
  cls,
96
- task: Task,
97
- allocation_id: str,
91
+ allocation: TaskAllocation,
98
92
  ) -> "TaskOutput":
99
93
  """Creates a TaskOutput for the case when task didn't finish because its allocation was removed by Server."""
100
94
  return TaskOutput(
101
- task=task,
102
- allocation_id=allocation_id,
95
+ allocation=allocation,
103
96
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
104
97
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED,
105
98
  )
@@ -107,16 +100,11 @@ class TaskOutput:
107
100
  @classmethod
108
101
  def function_executor_terminated(
109
102
  cls,
110
- task: Task,
111
- allocation_id: str,
103
+ allocation: TaskAllocation,
112
104
  ) -> "TaskOutput":
113
105
  """Creates a TaskOutput for the case when task didn't run because its FE terminated."""
114
106
  return TaskOutput(
115
- task=task,
116
- allocation_id=allocation_id,
107
+ allocation=allocation,
117
108
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
118
109
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
119
- # TODO: add FE startup stdout, stderr to the task output if FE failed to startup.
120
- stdout="",
121
- stderr="Can't execute the function because its Function Executor terminated.",
122
110
  )
@@ -121,13 +121,13 @@ async def _upload_to_blob_store(
121
121
  output: TaskOutput, blob_store: BLOBStore, logger: Any
122
122
  ) -> None:
123
123
  if output.stdout is not None:
124
- stdout_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stdout"
124
+ stdout_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.stdout"
125
125
  stdout_bytes: bytes = output.stdout.encode()
126
126
  await blob_store.put(stdout_url, stdout_bytes, logger)
127
127
  output.uploaded_stdout = DataPayload(
128
128
  uri=stdout_url,
129
129
  size=len(stdout_bytes),
130
- sha256_hash=_compute_hash(stdout_bytes),
130
+ sha256_hash=compute_hash(stdout_bytes),
131
131
  encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
132
132
  encoding_version=0,
133
133
  )
@@ -135,13 +135,13 @@ async def _upload_to_blob_store(
135
135
  output.stdout = None
136
136
 
137
137
  if output.stderr is not None:
138
- stderr_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stderr"
138
+ stderr_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.stderr"
139
139
  stderr_bytes: bytes = output.stderr.encode()
140
140
  await blob_store.put(stderr_url, stderr_bytes, logger)
141
141
  output.uploaded_stderr = DataPayload(
142
142
  uri=stderr_url,
143
143
  size=len(stderr_bytes),
144
- sha256_hash=_compute_hash(stderr_bytes),
144
+ sha256_hash=compute_hash(stderr_bytes),
145
145
  encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
146
146
  encoding_version=0,
147
147
  )
@@ -153,7 +153,7 @@ async def _upload_to_blob_store(
153
153
  uploaded_data_payloads = []
154
154
  for func_output_item in output.function_output.outputs:
155
155
  node_output_sequence = len(uploaded_data_payloads)
156
- output_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.{node_output_sequence}"
156
+ output_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.{node_output_sequence}"
157
157
  output_bytes: bytes = (
158
158
  func_output_item.bytes
159
159
  if func_output_item.HasField("bytes")
@@ -164,7 +164,7 @@ async def _upload_to_blob_store(
164
164
  DataPayload(
165
165
  uri=output_url,
166
166
  size=len(output_bytes),
167
- sha256_hash=_compute_hash(output_bytes),
167
+ sha256_hash=compute_hash(output_bytes),
168
168
  encoding=_to_grpc_data_payload_encoding(output),
169
169
  encoding_version=0,
170
170
  )
@@ -214,7 +214,7 @@ def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncodi
214
214
  return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
215
215
 
216
216
 
217
- def _compute_hash(data: bytes) -> str:
217
+ def compute_hash(data: bytes) -> str:
218
218
  hasher = hashlib.sha256(usedforsecurity=False)
219
219
  hasher.update(data)
220
220
  return hasher.hexdigest()
@@ -21,9 +21,9 @@ from .function_executor.server.function_executor_server_factory import (
21
21
  from .function_executor_controller import (
22
22
  FunctionExecutorController,
23
23
  function_executor_logger,
24
- task_logger,
24
+ task_allocation_logger,
25
25
  validate_function_executor_description,
26
- validate_task,
26
+ validate_task_allocation,
27
27
  )
28
28
  from .metrics.state_reconciler import (
29
29
  metric_state_reconciliation_errors,
@@ -376,31 +376,16 @@ class ExecutorStateReconciler:
376
376
  # Nothing to do, task already exists and it's immutable.
377
377
  return
378
378
 
379
- function_executor_controller.add_task(
380
- task=task_allocation.task,
381
- allocation_id=task_allocation.allocation_id,
382
- )
379
+ function_executor_controller.add_task_allocation(task_allocation)
383
380
 
384
381
  def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
385
382
  valid_task_allocations: List[TaskAllocation] = []
386
383
  for task_allocation in task_allocations:
387
384
  task_allocation: TaskAllocation
388
- logger = self._task_allocation_logger(task_allocation)
389
-
390
- try:
391
- validate_task(task_allocation.task)
392
- except ValueError as e:
393
- # There's no way to report this error to Server so just log it.
394
- logger.error(
395
- "received invalid TaskAllocation from Server, dropping it from desired state",
396
- exc_info=e,
397
- )
398
- continue
385
+ logger = task_allocation_logger(task_allocation, self._logger)
399
386
 
400
- validator = MessageValidator(task_allocation)
401
387
  try:
402
- validator.required_field("function_executor_id")
403
- validator.required_field("allocation_id")
388
+ validate_task_allocation(task_allocation)
404
389
  except ValueError as e:
405
390
  # There's no way to report this error to Server so just log it.
406
391
  logger.error(
@@ -423,16 +408,3 @@ class ExecutorStateReconciler:
423
408
  valid_task_allocations.append(task_allocation)
424
409
 
425
410
  return valid_task_allocations
426
-
427
- def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
428
- """Returns a logger for the given TaskAllocation.
429
-
430
- Doesn't assume that the supplied TaskAllocation is valid.
431
- """
432
- return task_logger(task_allocation.task, self._logger).bind(
433
- function_executor_id=(
434
- task_allocation.function_executor_id
435
- if task_allocation.HasField("function_executor_id")
436
- else None
437
- )
438
- )
@@ -9,7 +9,9 @@ from indexify.proto.executor_api_pb2 import (
9
9
  AllowedFunction,
10
10
  ExecutorState,
11
11
  ExecutorStatus,
12
+ ExecutorUpdate,
12
13
  FunctionExecutorState,
14
+ FunctionExecutorUpdate,
13
15
  GPUModel,
14
16
  GPUResources,
15
17
  )
@@ -24,8 +26,7 @@ from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
24
26
 
25
27
  from .channel_manager import ChannelManager
26
28
  from .function_allowlist import FunctionURI
27
- from .function_executor_controller.loggers import task_logger
28
- from .function_executor_controller.task_output import TaskOutput
29
+ from .function_executor_controller.loggers import task_result_logger
29
30
  from .host_resources.host_resources import HostResources, HostResourcesProvider
30
31
  from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
31
32
  from .metrics.state_reporter import (
@@ -83,7 +84,8 @@ class ExecutorStateReporter:
83
84
  self._last_server_clock: int = (
84
85
  0 # Server expects initial value to be 0 until it is set by Server.
85
86
  )
86
- self._completed_task_outputs: List[TaskOutput] = []
87
+ self._pending_task_results: List[TaskResult] = []
88
+ self._pending_fe_updates: List[FunctionExecutorUpdate] = []
87
89
  self._function_executor_states: Dict[str, FunctionExecutorState] = {}
88
90
 
89
91
  def update_executor_status(self, value: ExecutorStatus) -> None:
@@ -98,7 +100,7 @@ class ExecutorStateReporter:
98
100
  ) -> None:
99
101
  self._function_executor_states[state.description.id] = state
100
102
 
101
- def remove_function_executor_info(self, function_executor_id: str) -> None:
103
+ def remove_function_executor_state(self, function_executor_id: str) -> None:
102
104
  if function_executor_id not in self._function_executor_states:
103
105
  self._logger.warning(
104
106
  "attempted to remove non-existing function executor state",
@@ -108,8 +110,12 @@ class ExecutorStateReporter:
108
110
 
109
111
  self._function_executor_states.pop(function_executor_id)
110
112
 
111
- def add_completed_task_output(self, task_output: TaskOutput) -> None:
112
- self._completed_task_outputs.append(task_output)
113
+ def add_completed_task_result(self, task_result: TaskResult) -> None:
114
+ self._pending_task_results.append(task_result)
115
+
116
+ def add_function_executor_update(self, update: FunctionExecutorUpdate) -> None:
117
+ """Adds a function executor update to the list of updates to be reported."""
118
+ self._pending_fe_updates.append(update)
113
119
 
114
120
  def schedule_state_report(self) -> None:
115
121
  """Schedules a state report to be sent to the server asap.
@@ -212,19 +218,28 @@ class ExecutorStateReporter:
212
218
  ):
213
219
  metric_state_report_rpcs.inc()
214
220
  state: ExecutorState = self._current_executor_state()
215
- task_outputs: List[TaskOutput] = self._remove_completed_tasks()
216
- task_results: List[TaskResult] = _to_task_result_protos(task_outputs)
221
+ update: ExecutorUpdate = self._remove_pending_update()
222
+
223
+ for task_result in update.task_results:
224
+ task_result_logger(task_result, self._logger).info(
225
+ "reporting task outcome",
226
+ outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
227
+ failure_reason=(
228
+ TaskFailureReason.Name(task_result.failure_reason)
229
+ if task_result.HasField("failure_reason")
230
+ else "None"
231
+ ),
232
+ )
217
233
 
218
234
  try:
219
235
  await stub.report_executor_state(
220
236
  ReportExecutorStateRequest(
221
- executor_state=state, task_results=task_results
237
+ executor_state=state, executor_update=update
222
238
  ),
223
239
  timeout=_REPORT_RPC_TIMEOUT_SEC,
224
240
  )
225
241
  except Exception as e:
226
- for task_output in task_outputs:
227
- self.add_completed_task_output(task_output)
242
+ self._add_to_pending_update(update)
228
243
  raise
229
244
 
230
245
  def _current_executor_state(self) -> ExecutorState:
@@ -247,21 +262,26 @@ class ExecutorStateReporter:
247
262
  state.server_clock = self._last_server_clock
248
263
  return state
249
264
 
250
- def _remove_completed_tasks(self) -> List[TaskOutput]:
251
- task_outputs: List[TaskOutput] = []
252
- while len(self._completed_task_outputs) > 0:
253
- task_output = self._completed_task_outputs.pop()
254
- task_outputs.append(task_output)
255
- task_logger(task_output.task, self._logger).info(
256
- "reporting task outcome",
257
- outcome_code=TaskOutcomeCode.Name(task_output.outcome_code),
258
- failure_reason=(
259
- "None"
260
- if task_output.failure_reason is None
261
- else TaskFailureReason.Name(task_output.failure_reason)
262
- ),
263
- )
264
- return task_outputs
265
+ def _remove_pending_update(self) -> ExecutorUpdate:
266
+ """Removes all pending executor updates and returns them."""
267
+ # No races here because we don't await.
268
+ task_results: List[TaskResult] = self._pending_task_results
269
+ self._pending_task_results = []
270
+
271
+ fe_updates: List[FunctionExecutorUpdate] = self._pending_fe_updates
272
+ self._pending_fe_updates = []
273
+
274
+ return ExecutorUpdate(
275
+ executor_id=self._executor_id,
276
+ task_results=task_results,
277
+ function_executor_updates=fe_updates,
278
+ )
279
+
280
+ def _add_to_pending_update(self, update: ExecutorUpdate) -> None:
281
+ for task_result in update.task_results:
282
+ self.add_completed_task_result(task_result)
283
+ for function_executor_update in update.function_executor_updates:
284
+ self.add_function_executor_update(function_executor_update)
265
285
 
266
286
 
267
287
  def _to_allowed_function_protos(
@@ -324,36 +344,6 @@ def _to_gpu_model_proto(nvidia_gpu_model: NVIDIA_GPU_MODEL) -> GPUModel:
324
344
  return GPUModel.GPU_MODEL_UNKNOWN
325
345
 
326
346
 
327
- def _to_task_result_protos(task_outputs: List[TaskOutput]) -> List[TaskResult]:
328
- task_results: List[TaskResult] = []
329
-
330
- for output in task_outputs:
331
- task_result = TaskResult(
332
- task_id=output.task.id,
333
- allocation_id=output.allocation_id,
334
- namespace=output.task.namespace,
335
- graph_name=output.task.graph_name,
336
- function_name=output.task.function_name,
337
- graph_invocation_id=output.task.graph_invocation_id,
338
- reducer=output.reducer,
339
- outcome_code=output.outcome_code,
340
- next_functions=(output.router_output.edges if output.router_output else []),
341
- function_outputs=output.uploaded_data_payloads,
342
- )
343
- if output.failure_reason is not None:
344
- task_result.failure_reason = output.failure_reason
345
- if output.uploaded_stdout is not None:
346
- task_result.stdout.CopyFrom(output.uploaded_stdout)
347
- if output.uploaded_stderr is not None:
348
- task_result.stderr.CopyFrom(output.uploaded_stderr)
349
- if output.router_output is not None:
350
- task_result.routing.next_functions[:] = output.router_output.edges
351
-
352
- task_results.append(task_result)
353
-
354
- return task_results
355
-
356
-
357
347
  def _executor_labels() -> Dict[str, str]:
358
348
  """Returns standard executor labels always added to user supplied labels."""
359
349
  return {
@@ -109,6 +109,10 @@ message FunctionExecutorDescription {
109
109
  optional uint32 customer_code_timeout_ms = 9;
110
110
  optional DataPayload graph = 10;
111
111
  optional FunctionExecutorResources resources = 11;
112
+ // URI prefix for the startup output payloads.
113
+ // S3 URI if the data is stored in S3.
114
+ // Starts with "file://" prefix followed by an absolute directory path if the data is stored on a local file system.
115
+ optional string output_payload_uri_prefix = 12;
112
116
  }
113
117
 
114
118
  message FunctionExecutorState {
@@ -117,6 +121,12 @@ message FunctionExecutorState {
117
121
  optional FunctionExecutorTerminationReason termination_reason = 3;
118
122
  }
119
123
 
124
+ message FunctionExecutorUpdate {
125
+ optional FunctionExecutorDescription description = 1;
126
+ optional DataPayload startup_stdout = 2;
127
+ optional DataPayload startup_stderr = 3;
128
+ }
129
+
120
130
  enum ExecutorStatus {
121
131
  EXECUTOR_STATUS_UNKNOWN = 0;
122
132
  EXECUTOR_STATUS_STARTING_UP = 1;
@@ -145,13 +155,20 @@ message ExecutorState {
145
155
  optional uint64 server_clock = 12;
146
156
  }
147
157
 
148
- // A message sent by Executor to report its up to date state to Server.
158
+ // Updates that Executor wants to report to Server. If report_executor_state RPC is successful
159
+ // then the updates from it won't be included in the next RPC.
160
+ message ExecutorUpdate {
161
+ optional string executor_id = 1;
162
+ repeated TaskResult task_results = 2;
163
+ repeated FunctionExecutorUpdate function_executor_updates = 3;
164
+ }
165
+
149
166
  message ReportExecutorStateRequest {
150
167
  optional ExecutorState executor_state = 1;
151
- repeated TaskResult task_results = 2;
168
+ optional ExecutorUpdate executor_update = 2;
152
169
  }
153
170
 
154
- // A message sent by Server to Executor to acknowledge the receipt of Executor state.
171
+ // A message sent by Server to Executor to acknowledge the receipt of ReportExecutorStateRequest.
155
172
  message ReportExecutorStateResponse {
156
173
  }
157
174
 
@@ -226,24 +243,24 @@ message ResultRouting {
226
243
 
227
244
  message TaskResult {
228
245
  optional string task_id = 1;
229
- optional string namespace = 2;
230
- optional string graph_name = 3;
231
- optional string function_name = 4;
232
- optional string graph_invocation_id = 5;
233
- optional bool reducer = 6;
234
- optional TaskOutcomeCode outcome_code = 7;
235
- optional TaskFailureReason failure_reason = 8;
246
+ optional string allocation_id = 2;
247
+ optional string namespace = 3;
248
+ optional string graph_name = 4;
249
+ optional string graph_version = 5;
250
+ optional string function_name = 6;
251
+ optional string graph_invocation_id = 7;
252
+ optional bool reducer = 8;
253
+ optional TaskOutcomeCode outcome_code = 9;
254
+ optional TaskFailureReason failure_reason = 10;
236
255
  // Edges that the function wants the invocation to be routed to.
237
256
  // Previously called router_edges.
238
257
  // NB: An empty list indicates that the graph's route definitions should be used,
239
258
  // unless this field is overridden by the presence of the `routing` field.
240
- repeated string next_functions = 9;
241
- repeated DataPayload function_outputs = 10;
259
+ repeated string next_functions = 11;
260
+ repeated DataPayload function_outputs = 12;
242
261
  // Standard output and error streams of the function.
243
- optional DataPayload stdout = 11;
244
- optional DataPayload stderr = 12;
245
-
246
- optional string allocation_id = 13;
262
+ optional DataPayload stdout = 13;
263
+ optional DataPayload stderr = 14;
247
264
 
248
265
  // Indicates how the results should be routed.
249
266
  // If this is present, it replaces `next_functions`.
@@ -253,7 +270,7 @@ message TaskResult {
253
270
  // graph's routing. The long-term goal is to deprecate
254
271
  // `next_functions`, so that if `routing` is not present, the
255
272
  // graph's routing definitions will always be used.
256
- ResultRouting routing = 14;
273
+ ResultRouting routing = 15;
257
274
  }
258
275
 
259
276
  // Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.