indexify 0.3.18-py3-none-any.whl → 0.3.19-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (27)
  1. indexify/cli/cli.py +3 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/downloader.py +4 -1
  4. indexify/executor/executor.py +51 -29
  5. indexify/executor/function_executor/function_executor.py +24 -11
  6. indexify/executor/function_executor/function_executor_state.py +9 -1
  7. indexify/executor/function_executor/function_executor_states_container.py +3 -1
  8. indexify/executor/function_executor/function_executor_status.py +2 -0
  9. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  10. indexify/executor/function_executor/single_task_runner.py +15 -11
  11. indexify/executor/function_executor/task_output.py +35 -2
  12. indexify/executor/grpc/completed_tasks_container.py +26 -0
  13. indexify/executor/grpc/function_executor_controller.py +421 -0
  14. indexify/executor/grpc/state_reconciler.py +24 -34
  15. indexify/executor/grpc/state_reporter.py +35 -32
  16. indexify/executor/grpc/task_controller.py +449 -0
  17. indexify/executor/metrics/task_reporter.py +14 -0
  18. indexify/executor/task_reporter.py +95 -4
  19. indexify/executor/task_runner.py +1 -0
  20. indexify/proto/executor_api.proto +63 -5
  21. indexify/proto/executor_api_pb2.py +40 -30
  22. indexify/proto/executor_api_pb2.pyi +118 -3
  23. indexify/proto/executor_api_pb2_grpc.py +47 -0
  24. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
  25. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/RECORD +27 -24
  26. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
  27. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
indexify/executor/grpc/task_controller.py (new file)
@@ -0,0 +1,449 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+import grpc
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    RunTaskRequest,
+    RunTaskResponse,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
+    FunctionExecutorStub,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import Task
+
+from ..downloader import Downloader
+from ..function_executor.function_executor_state import FunctionExecutorState
+from ..function_executor.function_executor_status import FunctionExecutorStatus
+from ..function_executor.metrics.single_task_runner import (
+    metric_function_executor_run_task_rpc_errors,
+    metric_function_executor_run_task_rpc_latency,
+    metric_function_executor_run_task_rpcs,
+)
+from ..function_executor.task_output import TaskMetrics, TaskOutput
+
+# TODO: combine these metrics into a single python file once gRPC migration is over and old code is removed.
+from ..metrics.executor import (
+    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
+    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
+    metric_task_completion_latency,
+    metric_task_outcome_report_latency,
+    metric_task_outcome_report_retries,
+    metric_task_outcome_reports,
+    metric_tasks_completed,
+    metric_tasks_reporting_outcome,
+)
+from ..metrics.task_runner import (
+    metric_task_policy_errors,
+    metric_task_policy_latency,
+    metric_task_policy_runs,
+    metric_task_run_latency,
+    metric_task_run_platform_errors,
+    metric_task_runs,
+    metric_tasks_blocked_by_policy,
+    metric_tasks_blocked_by_policy_per_function_name,
+    metric_tasks_running,
+)
+from ..task_reporter import TaskReporter
+from .completed_tasks_container import CompletedTasksContainer
+
+_TASK_OUTCOME_REPORT_BACKOFF_SEC = 5.0
+
+
+class FunctionTimeoutError(Exception):
+    """Exception raised when a customer's task execution exceeds the allowed timeout."""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class TaskController:
+    def __init__(
+        self,
+        task: Task,
+        function_executor_state: FunctionExecutorState,
+        downloader: Downloader,
+        task_reporter: TaskReporter,
+        completed_tasks_container: CompletedTasksContainer,
+        logger: Any,
+    ):
+        """Creates a new TaskController instance.
+
+        Raises ValueError if the supplied Task is not valid.
+        """
+        _validate_task(task)
+        self._task: Task = task
+        self._function_executor_state: FunctionExecutorState = function_executor_state
+        self._downloader: Downloader = downloader
+        self._task_reporter: TaskReporter = task_reporter
+        self._completed_tasks_container: CompletedTasksContainer = (
+            completed_tasks_container
+        )
+        self._logger: Any = logger.bind(
+            function_executor_id=function_executor_state.id,
+            task_id=task.id,
+            module=__name__,
+            namespace=task.namespace,
+            graph_name=task.graph_name,
+            graph_version=task.graph_version,
+            function_name=task.function_name,
+            invocation_id=task.graph_invocation_id,
+        )
+        self._is_running: bool = False
+        self._is_cancelled: bool = False
+        self._input: Optional[SerializedObject] = None
+        self._init_value: Optional[SerializedObject] = None
+        self._output: Optional[TaskOutput] = None
+
+    async def cancel_task(self) -> None:
+        """Cancels the task."""
+        self._is_cancelled = True
+
+        async with self._function_executor_state.lock:
+            if not self._is_running:
+                return
+
+            # Mark the Function Executor as unhealthy to destroy it to cancel the running function.
+            # If FE status changed, then it means that we're off normal task execution path, e.g.
+            # Server decided to do something with FE.
+            if (
+                self._function_executor_state.status
+                == FunctionExecutorStatus.RUNNING_TASK
+            ):
+                # TODO: Add a separate FE status for cancelled function so we don't lie to server that FE is unhealthy to destroy it.
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.UNHEALTHY,
+                )
+                self._logger.warning("task is cancelled")
+            else:
+                self._logger.warning(
+                    "skipping marking Function Executor unhealthy on task cancellation due to unexpected FE status",
+                    status=self._function_executor_state.status.name,
+                )
+
+    async def run_task(self) -> None:
+        """Runs the supplied task and does full management of its lifecycle.
+
+        Doesn't raise any exceptions."""
+        start_time: float = time.monotonic()
+
+        try:
+            # The task can be cancelled at any time but we'll just wait until FE gets shutdown
+            # because we require this to happen from the cancel_task() caller.
+            self._input = await self._downloader.download_input(
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                graph_invocation_id=self._task.graph_invocation_id,
+                input_key=self._task.input_key,
+                logger=self._logger,
+            )
+            if self._task.HasField("reducer_output_key"):
+                self._init_value = await self._downloader.download_init_value(
+                    namespace=self._task.namespace,
+                    graph_name=self._task.graph_name,
+                    function_name=self._task.function_name,
+                    graph_invocation_id=self._task.graph_invocation_id,
+                    reducer_output_key=self._task.reducer_output_key,
+                    logger=self._logger,
+                )
+
+            await self._wait_for_idle_function_executor()
+
+            with (
+                metric_task_run_platform_errors.count_exceptions(),
+                metric_tasks_running.track_inprogress(),
+                metric_task_run_latency.time(),
+            ):
+                metric_task_runs.inc()
+                await self._run_task()
+
+            self._logger.info("task execution finished", success=self._output.success)
+        except FunctionTimeoutError:
+            self._output = TaskOutput.function_timeout(
+                task_id=self._task.id,
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                function_name=self._task.function_name,
+                graph_version=self._task.graph_version,
+                graph_invocation_id=self._task.graph_invocation_id,
+            )
+            async with self._function_executor_state.lock:
+                # Mark the Function Executor as unhealthy to destroy it to cancel the running function.
+                # If FE status changed, then it means that we're off normal task execution path, e.g.
+                # Server decided to do something with FE.
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.RUNNING_TASK
+                ):
+                    # TODO: Add a separate FE status for timed out function so we don't lie to server that FE is unhealthy to destroy it.
+                    await self._function_executor_state.set_status(
+                        FunctionExecutorStatus.UNHEALTHY,
+                    )
+                else:
+                    self._logger.warning(
+                        "skipping marking Function Executor unhealthy on task timeout due to unexpected FE status",
+                        status=self._function_executor_state.status.name,
+                    )
+        except Exception as e:
+            self._output = TaskOutput.internal_error(
+                task_id=self._task.id,
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                function_name=self._task.function_name,
+                graph_version=self._task.graph_version,
+                graph_invocation_id=self._task.graph_invocation_id,
+            )
+            self._logger.error("task execution failed", exc_info=e)
+        finally:
+            # Release the Function Executor so others can run tasks on it if FE status didn't change.
+            # If FE status changed, then it means that we're off normal task execution path, e.g.
+            # Server decided to do something with FE.
+            async with self._function_executor_state.lock:
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.RUNNING_TASK
+                ):
+                    await self._function_executor_state.set_status(
+                        FunctionExecutorStatus.IDLE
+                    )
+                else:
+                    self._logger.warning(
+                        "skipping marking Function Executor IDLE due to unexpected FE status",
+                        status=self._function_executor_state.status,
+                    )
+
+        _log_function_metrics(self._output, self._logger)
+
+        with (
+            metric_tasks_reporting_outcome.track_inprogress(),
+            metric_task_outcome_report_latency.time(),
+        ):
+            metric_task_outcome_reports.inc()
+            await self._report_task_outcome()
+
+        metric_task_completion_latency.observe(time.monotonic() - start_time)
+
+    async def _wait_for_idle_function_executor(self) -> None:
+        """Waits until the Function Executor is in IDLE state.
+
+        Raises an Exception if the Function Executor is in SHUTDOWN state.
+        """
+        with (
+            metric_task_policy_errors.count_exceptions(),
+            metric_tasks_blocked_by_policy.track_inprogress(),
+            metric_tasks_blocked_by_policy_per_function_name.labels(
+                function_name=self._task.function_name
+            ).track_inprogress(),
+            metric_task_policy_latency.time(),
+        ):
+            metric_task_policy_runs.inc()
+            self._logger.info(
+                "task is blocked by policy: waiting for idle function executor"
+            )
+            async with self._function_executor_state.lock:
+                await self._function_executor_state.wait_status(
+                    allowlist=[
+                        FunctionExecutorStatus.IDLE,
+                        FunctionExecutorStatus.SHUTDOWN,
+                    ]
+                )
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.SHUTDOWN
+                ):
+                    raise Exception(
+                        "Task's Function Executor got shutdown, can't run task"
+                    )
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.RUNNING_TASK
+                )
+
+        # At this point the Function Executor belongs to this task controller due to RUNNING_TASK status.
+        # We can now unlock the FE state. We have to update the FE status once the task succeeds or fails.
+
+    async def _run_task(self) -> None:
+        request: RunTaskRequest = RunTaskRequest(
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            graph_version=self._task.graph_version,
+            function_name=self._task.function_name,
+            graph_invocation_id=self._task.graph_invocation_id,
+            task_id=self._task.id,
+            function_input=self._input,
+        )
+        if self._init_value is not None:
+            request.function_init_value.CopyFrom(self._init_value)
+        channel: grpc.aio.Channel = (
+            self._function_executor_state.function_executor.channel()
+        )
+
+        timeout_sec: Optional[float] = None
+        if self._task.HasField("timeout_ms"):
+            # TODO: Add integration tests with function timeout when end-to-end implementation is done.
+            timeout_sec = self._task.timeout_ms / 1000.0
+
+        async with _RunningTaskContextManager(
+            task_controller=self,
+        ):
+            with (
+                metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                metric_function_executor_run_task_rpc_latency.time(),
+            ):
+                metric_function_executor_run_task_rpcs.inc()
+                # If this RPC failed due to customer code crashing the server we won't be
+                # able to detect this. We'll treat this as our own error for now and thus
+                # let the AioRpcError be raised here.
+                try:
+                    response: RunTaskResponse = await FunctionExecutorStub(
+                        channel
+                    ).run_task(request, timeout=timeout_sec)
+                except grpc.aio.AioRpcError as e:
+                    if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                        raise FunctionTimeoutError(
+                            f"Task execution timeout {timeout_sec} expired"
+                        ) from e
+                    raise
+
+        self._output = _task_output(task=self._task, response=response)
+
+    async def _report_task_outcome(self) -> None:
+        """Reports the task with the given output to the server.
+
+        Doesn't raise any Exceptions. Runs till the reporting is successful."""
+        reporting_retries: int = 0
+
+        while True:
+            logger = self._logger.bind(retries=reporting_retries)
+            if self._is_cancelled:
+                logger.warning(
+                    "task is cancelled, skipping its outcome reporting to workaround lack of server side retries"
+                )
+                break
+
+            try:
+                await self._task_reporter.report(output=self._output, logger=logger)
+                break
+            except Exception as e:
+                logger.error(
+                    "failed to report task",
+                    exc_info=e,
+                )
+                reporting_retries += 1
+                metric_task_outcome_report_retries.inc()
+                await asyncio.sleep(_TASK_OUTCOME_REPORT_BACKOFF_SEC)
+
+        await self._completed_tasks_container.add(self._task.id)
+        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
+        if self._output.is_internal_error:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
+            ).inc()
+        elif self._output.success:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
+            ).inc()
+        else:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+            ).inc()
+
+
+def _validate_task(task: Task) -> None:
+    """Validates the supplied Task.
+
+    Raises ValueError if the Task is not valid.
+    """
+    validator = MessageValidator(task)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    validator.required_field("graph_invocation_id")
+    validator.required_field("input_key")
+
+
+def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
+
+    output = TaskOutput(
+        task_id=task.id,
+        namespace=task.namespace,
+        graph_name=task.graph_name,
+        function_name=task.function_name,
+        graph_version=task.graph_version,
+        graph_invocation_id=task.graph_invocation_id,
+        stdout=response.stdout,
+        stderr=response.stderr,
+        reducer=response.is_reducer,
+        success=response.success,
+        metrics=metrics,
+    )
+
+    if response.HasField("function_output"):
+        output.function_output = response.function_output
+    if response.HasField("router_output"):
+        output.router_output = response.router_output
+
+    return output
+
+
+# Temporary workaround is logging customer metrics until we store them somewhere
+# for future retrieval and processing.
+def _log_function_metrics(output: TaskOutput, logger: Any):
+    if output.metrics is None:
+        return
+
+    logger = logger.bind(
+        invocation_id=output.graph_invocation_id,
+        function_name=output.function_name,
+        graph_name=output.graph_name,
+        namespace=output.namespace,
+    )
+
+    for counter_name, counter_value in output.metrics.counters.items():
+        logger.info(
+            "function_metric", counter_name=counter_name, counter_value=counter_value
+        )
+    for timer_name, timer_value in output.metrics.timers.items():
+        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
+
+
+class _RunningTaskContextManager:
+    """Performs all the actions required before and after running a task."""
+
+    def __init__(
+        self,
+        task_controller: TaskController,
+    ):
+        self._task_controller: TaskController = task_controller
+
+    async def __aenter__(self):
+        self._task_controller._function_executor_state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
+            task_id=self._task_controller._task.id,
+            invocation_id=self._task_controller._task.graph_invocation_id,
+        )
+        self._task_controller._is_running = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        self._task_controller._is_running = False
+        self._task_controller._function_executor_state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
+            task_id=self._task_controller._task.id,
+        )
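The new TaskController is driven entirely through run_task() and cancel_task(). Below is a minimal sketch of how an owning component (for example the new function_executor_controller.py, whose internals are not shown in this diff) might schedule a task and later cancel it. The run_then_cancel_example function and the absolute import paths are illustrative assumptions, not part of the package.

import asyncio
from typing import Any

from indexify.proto.executor_api_pb2 import Task

from indexify.executor.downloader import Downloader
from indexify.executor.function_executor.function_executor_state import (
    FunctionExecutorState,
)
from indexify.executor.grpc.completed_tasks_container import CompletedTasksContainer
from indexify.executor.grpc.task_controller import TaskController
from indexify.executor.task_reporter import TaskReporter


async def run_then_cancel_example(
    task: Task,
    function_executor_state: FunctionExecutorState,
    downloader: Downloader,
    task_reporter: TaskReporter,
    completed_tasks_container: CompletedTasksContainer,
    logger: Any,
) -> None:
    # Illustrative driver only. TaskController validates the Task in its
    # constructor and raises ValueError if required fields are missing.
    controller = TaskController(
        task=task,
        function_executor_state=function_executor_state,
        downloader=downloader,
        task_reporter=task_reporter,
        completed_tasks_container=completed_tasks_container,
        logger=logger,
    )
    # run_task() owns the whole lifecycle: download inputs, wait for an IDLE
    # Function Executor, run the RunTask RPC, then report the outcome.
    # It never raises, so it is safe to run as a background task.
    run = asyncio.create_task(controller.run_task())
    # Cancellation is cooperative: cancel_task() marks the Function Executor
    # UNHEALTHY so the running function is destroyed along with it.
    await controller.cancel_task()
    await run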
indexify/executor/metrics/task_reporter.py
@@ -20,3 +20,17 @@ metric_server_ingest_files_latency: prometheus_client.Histogram = (
         "server_ingest_files_request", "Ingest files request to Server"
     )
 )
+
+metric_report_task_outcome_rpcs = prometheus_client.Counter(
+    "report_task_outcome_rpcs",
+    "Number of report task outcome RPCs to Server",
+)
+metric_report_task_outcome_errors = prometheus_client.Counter(
+    "report_task_outcome_rpc_errors",
+    "Number of report task outcome RPC errors",
+)
+metric_report_task_outcome_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "report_task_outcome_rpc", "Report task outcome RPC to Server"
+    )
+)
indexify/executor/task_reporter.py
@@ -7,14 +7,27 @@ from httpx import Timeout
 from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
 from tensorlake.utils.http_client import get_httpx_client
 
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    OutputEncoding,
+    ReportTaskOutcomeRequest,
+    TaskOutcome,
+)
+from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
+
 from .api_objects import (
     TASK_OUTCOME_FAILURE,
     TASK_OUTCOME_SUCCESS,
+    IngestFnOutputsResponse,
     RouterOutput,
     TaskResult,
 )
 from .function_executor.task_output import TaskOutput
+from .grpc.channel_manager import ChannelManager
 from .metrics.task_reporter import (
+    metric_report_task_outcome_errors,
+    metric_report_task_outcome_latency,
+    metric_report_task_outcome_rpcs,
     metric_server_ingest_files_errors,
     metric_server_ingest_files_latency,
     metric_server_ingest_files_requests,
@@ -45,7 +58,11 @@ class TaskOutputSummary:
 
 class TaskReporter:
     def __init__(
-        self, base_url: str, executor_id: str, config_path: Optional[str] = None
+        self,
+        base_url: str,
+        executor_id: str,
+        channel_manager: ChannelManager,
+        config_path: Optional[str] = None,
     ):
         self._base_url = base_url
         self._executor_id = executor_id
@@ -56,6 +73,7 @@ class TaskReporter:
         # Creating a new async client for each request fixes this but it
         # results in not reusing established TCP connections to server.
         self._client = get_httpx_client(config_path, make_async=False)
+        self._channel_manager = channel_manager
 
     async def shutdown(self):
        """Shuts down the task reporter.
@@ -109,12 +127,12 @@ class TaskReporter:
         # Run in a separate thread to not block the main event loop.
         response = await asyncio.to_thread(
             self._client.post,
-            url=f"{self._base_url}/internal/ingest_files",
+            url=f"{self._base_url}/internal/ingest_fn_outputs",
             **kwargs,
         )
         end_time = time.time()
         logger.info(
-            "task outcome reported",
+            "files uploaded",
             response_time=end_time - start_time,
             response_code=response.status_code,
         )
@@ -125,11 +143,70 @@ class TaskReporter:
             metric_server_ingest_files_errors.inc()
             # Caller catches and logs the exception.
             raise Exception(
-                "failed to report task outcome. "
+                "failed to upload files. "
                 f"Response code: {response.status_code}. "
                 f"Response text: '{response.text}'."
             ) from e
 
+        # TODO: If the files are uploaded successfully,
+        # we should record that so that if we fail to report
+        # the task outcome, we don't retry the upload.
+        # This will save us some time and resources.
+
+        ingested_files_response = response.json()
+        ingested_files = IngestFnOutputsResponse.model_validate(ingested_files_response)
+        fn_outputs = []
+        for data_payload in ingested_files.data_payloads:
+            fn_outputs.append(
+                DataPayload(
+                    path=data_payload.path,
+                    size=data_payload.size,
+                    sha256_hash=data_payload.sha256_hash,
+                )
+            )
+        stdout, stderr = None, None
+        if ingested_files.stdout:
+            stdout = DataPayload(
+                path=ingested_files.stdout.path,
+                size=ingested_files.stdout.size,
+                sha256_hash=ingested_files.stdout.sha256_hash,
+            )
+        if ingested_files.stderr:
+            stderr = DataPayload(
+                path=ingested_files.stderr.path,
+                size=ingested_files.stderr.size,
+                sha256_hash=ingested_files.stderr.sha256_hash,
+            )
+
+        request = ReportTaskOutcomeRequest(
+            task_id=output.task_id,
+            namespace=output.namespace,
+            graph_name=output.graph_name,
+            function_name=output.function_name,
+            graph_invocation_id=output.graph_invocation_id,
+            outcome=_to_grpc_task_outcome(output),
+            invocation_id=output.graph_invocation_id,
+            executor_id=self._executor_id,
+            reducer=output.reducer,
+            next_functions=(output.router_output.edges if output.router_output else []),
+            fn_outputs=fn_outputs,
+            stdout=stdout,
+            stderr=stderr,
+            output_encoding=_to_grpc_output_encoding(output),
+            output_encoding_version=0,
+        )
+        try:
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            with (
+                metric_report_task_outcome_latency.time(),
+                metric_report_task_outcome_errors.count_exceptions(),
+            ):
+                metric_report_task_outcome_rpcs.inc()
+                await stub.report_task_outcome(request, timeout=5.0)
+        except Exception as e:
+            logger.error("failed to report task outcome", error=e)
+            raise e
+
     def _process_task_output(
         self, output: TaskOutput
     ) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
@@ -246,3 +323,17 @@ def _process_stderr(
     )
     summary.stderr_count += 1
     summary.stderr_total_bytes += len(stderr)
+
+
+def _to_grpc_task_outcome(task_output: TaskOutput) -> TaskOutcome:
+    if task_output.success:
+        return TaskOutcome.TASK_OUTCOME_SUCCESS
+    else:
+        return TaskOutcome.TASK_OUTCOME_FAILURE
+
+
+def _to_grpc_output_encoding(task_output: TaskOutput) -> OutputEncoding:
+    if task_output.output_encoding == "json":
+        return OutputEncoding.OUTPUT_ENCODING_JSON
+    else:
+        return OutputEncoding.OUTPUT_ENCODING_PICKLE
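TaskReporter now reports outcomes over gRPC in addition to uploading output files over HTTP, so its constructor requires a ChannelManager. Below is a minimal wiring sketch under that assumption; the base_url, executor_id, and report_output_example names are illustrative, and how the ChannelManager itself is constructed is outside this diff.

from typing import Any

from indexify.executor.function_executor.task_output import TaskOutput
from indexify.executor.grpc.channel_manager import ChannelManager
from indexify.executor.task_reporter import TaskReporter


async def report_output_example(
    output: TaskOutput,
    channel_manager: ChannelManager,
    logger: Any,
) -> None:
    reporter = TaskReporter(
        base_url="http://localhost:8900",  # illustrative server address
        executor_id="executor-1",  # illustrative executor id
        channel_manager=channel_manager,
        config_path=None,
    )
    # report() first uploads function outputs, stdout, and stderr over HTTP
    # (POST {base_url}/internal/ingest_fn_outputs), then sends a
    # ReportTaskOutcomeRequest over gRPC with a 5 second timeout.
    await reporter.report(output=output, logger=logger)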
indexify/executor/task_runner.py
@@ -105,6 +105,7 @@ class TaskRunner:
             graph_version=task_input.task.graph_version,
             function_name=task_input.task.compute_fn,
             image_uri=task_input.task.image_uri,
+            secret_names=task_input.task.secret_names or [],
         )
         await state.lock.acquire()