indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. indexify/cli/executor.py +2 -9
  2. indexify/executor/blob_store/blob_store.py +110 -26
  3. indexify/executor/blob_store/local_fs_blob_store.py +41 -1
  4. indexify/executor/blob_store/metrics/blob_store.py +87 -15
  5. indexify/executor/blob_store/s3_blob_store.py +112 -1
  6. indexify/executor/function_executor/function_executor.py +32 -56
  7. indexify/executor/function_executor/invocation_state_client.py +10 -3
  8. indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
  9. indexify/executor/function_executor_controller/create_function_executor.py +129 -116
  10. indexify/executor/function_executor_controller/downloads.py +34 -86
  11. indexify/executor/function_executor_controller/events.py +13 -7
  12. indexify/executor/function_executor_controller/finalize_task.py +184 -0
  13. indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
  14. indexify/executor/function_executor_controller/message_validators.py +10 -3
  15. indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
  16. indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
  17. indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
  18. indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
  19. indexify/executor/function_executor_controller/prepare_task.py +232 -14
  20. indexify/executor/function_executor_controller/run_task.py +189 -81
  21. indexify/executor/function_executor_controller/task_info.py +4 -7
  22. indexify/executor/function_executor_controller/task_input.py +21 -0
  23. indexify/executor/function_executor_controller/task_output.py +41 -33
  24. indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
  25. indexify/executor/logging.py +69 -0
  26. indexify/executor/monitoring/metrics.py +22 -0
  27. indexify/proto/executor_api.proto +11 -3
  28. indexify/proto/executor_api_pb2.py +54 -54
  29. indexify/proto/executor_api_pb2.pyi +8 -1
  30. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
  31. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
  32. indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
  33. indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
  34. indexify/executor/function_executor_controller/upload_task_output.py +0 -274
  35. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
  36. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,17 @@
1
1
  import asyncio
2
- import os
3
- import random
4
2
  import time
5
3
  from typing import Any, Optional
6
4
 
7
5
  import grpc
8
6
  from tensorlake.function_executor.proto.function_executor_pb2 import (
9
- RunTaskRequest,
10
- RunTaskResponse,
11
- SerializedObject,
7
+ BLOB,
8
+ AwaitTaskProgress,
9
+ AwaitTaskRequest,
10
+ CreateTaskRequest,
11
+ DeleteTaskRequest,
12
+ SerializedObjectInsideBLOB,
13
+ Task,
14
+ TaskDiagnostics,
12
15
  )
13
16
  from tensorlake.function_executor.proto.function_executor_pb2 import (
14
17
  TaskFailureReason as FETaskFailureReason,
@@ -16,6 +19,9 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
16
19
  from tensorlake.function_executor.proto.function_executor_pb2 import (
17
20
  TaskOutcomeCode as FETaskOutcomeCode,
18
21
  )
22
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
23
+ TaskResult,
24
+ )
19
25
  from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
20
26
  FunctionExecutorStub,
21
27
  )
@@ -40,9 +46,8 @@ from .metrics.run_task import (
40
46
  from .task_info import TaskInfo
41
47
  from .task_output import TaskMetrics, TaskOutput
42
48
 
43
- _ENABLE_INJECT_TASK_CANCELLATIONS = (
44
- os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
45
- )
49
+ _CREATE_TASK_TIMEOUT_SECS = 5
50
+ _DELETE_TASK_TIMEOUT_SECS = 5
46
51
 
47
52
 
48
53
  async def run_task_on_function_executor(
@@ -53,7 +58,22 @@ async def run_task_on_function_executor(
53
58
  Doesn't raise any exceptions.
54
59
  """
55
60
  logger = logger.bind(module=__name__)
56
- request: RunTaskRequest = RunTaskRequest(
61
+
62
+ if task_info.input is None:
63
+ logger.error(
64
+ "task input is None, this should never happen",
65
+ )
66
+ task_info.output = TaskOutput.internal_error(
67
+ allocation=task_info.allocation,
68
+ execution_start_time=None,
69
+ execution_end_time=None,
70
+ )
71
+ return TaskExecutionFinished(
72
+ task_info=task_info,
73
+ function_executor_termination_reason=None,
74
+ )
75
+
76
+ task = Task(
57
77
  namespace=task_info.allocation.task.namespace,
58
78
  graph_name=task_info.allocation.task.graph_name,
59
79
  graph_version=task_info.allocation.task.graph_version,
@@ -61,15 +81,8 @@ async def run_task_on_function_executor(
61
81
  graph_invocation_id=task_info.allocation.task.graph_invocation_id,
62
82
  task_id=task_info.allocation.task.id,
63
83
  allocation_id=task_info.allocation.allocation_id,
64
- function_input=task_info.input,
84
+ request=task_info.input.function_inputs,
65
85
  )
66
- # Don't keep the input in memory after we started running the task.
67
- task_info.input = None
68
-
69
- if task_info.init_value is not None:
70
- request.function_init_value.CopyFrom(task_info.init_value)
71
- # Don't keep the init value in memory after we started running the task.
72
- task_info.init_value = None
73
86
 
74
87
  function_executor.invocation_state_client().add_task_to_invocation_id_entry(
75
88
  task_id=task_info.allocation.task.id,
@@ -78,51 +91,78 @@ async def run_task_on_function_executor(
78
91
 
79
92
  metric_function_executor_run_task_rpcs.inc()
80
93
  metric_function_executor_run_task_rpcs_in_progress.inc()
81
- start_time = time.monotonic()
82
94
  # Not None if the Function Executor should be terminated after running the task.
83
95
  function_executor_termination_reason: Optional[
84
96
  FunctionExecutorTerminationReason
85
97
  ] = None
86
- execution_start_time: Optional[float] = None
98
+
99
+ # NB: We start this timer before invoking the first RPC, since
100
+ # user code should be executing by the time the create_task() RPC
101
+ # returns, so not attributing the task management RPC overhead to
102
+ # the user would open a possibility for abuse. (This is somewhat
103
+ # mitigated by the fact that these RPCs should have a very low
104
+ # overhead.)
105
+ execution_start_time: Optional[float] = time.monotonic()
87
106
 
88
107
  # If this RPC failed due to customer code crashing the server we won't be
89
108
  # able to detect this. We'll treat this as our own error for now and thus
90
109
  # let the AioRpcError to be raised here.
91
- timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
110
+ timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
92
111
  try:
93
- channel: grpc.aio.Channel = function_executor.channel()
94
- execution_start_time = time.monotonic()
95
- response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
96
- request, timeout=timeout_sec
97
- )
98
- task_info.output = _task_output_from_function_executor_response(
112
+ # This aio task can only be cancelled during this await call.
113
+ task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
114
+
115
+ _process_task_diagnostics(task_result.diagnostics, logger)
116
+
117
+ task_info.output = _task_output_from_function_executor_result(
99
118
  allocation=task_info.allocation,
100
- response=response,
119
+ result=task_result,
101
120
  execution_start_time=execution_start_time,
102
121
  execution_end_time=time.monotonic(),
103
122
  logger=logger,
104
123
  )
124
+ except asyncio.TimeoutError:
125
+ # This is an await_task() RPC timeout - we're not getting
126
+ # progress messages or a task completion.
127
+ function_executor_termination_reason = (
128
+ FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
129
+ )
130
+ task_info.output = TaskOutput.function_timeout(
131
+ allocation=task_info.allocation,
132
+ execution_start_time=execution_start_time,
133
+ execution_end_time=time.monotonic(),
134
+ )
105
135
  except grpc.aio.AioRpcError as e:
136
+ # This indicates some sort of problem communicating with the FE.
137
+ #
138
+ # NB: We charge the user in these situations: code within the
139
+ # FE is not isolated, so not charging would enable abuse.
140
+ #
141
+ # This is an unexpected situation, though, so we make sure to
142
+ # log the situation for further investigation.
143
+
144
+ function_executor_termination_reason = (
145
+ FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
146
+ )
147
+ metric_function_executor_run_task_rpc_errors.inc()
148
+
106
149
  if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
107
- # The task is still running in FE, we only cancelled the client-side RPC.
108
- function_executor_termination_reason = (
109
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
110
- )
111
- task_info.output = TaskOutput.function_timeout(
112
- allocation=task_info.allocation,
113
- timeout_sec=timeout_sec,
114
- execution_start_time=execution_start_time,
115
- execution_end_time=time.monotonic(),
116
- )
150
+ # This is either a create_task() RPC timeout or a
151
+ # delete_task() RPC timeout; either suggests that the FE
152
+ # is unhealthy.
153
+ logger.error("task management RPC execution deadline exceeded", exc_info=e)
117
154
  else:
118
- metric_function_executor_run_task_rpc_errors.inc()
119
- logger.error("task execution failed", exc_info=e)
120
- task_info.output = TaskOutput.internal_error(
121
- allocation=task_info.allocation,
122
- execution_start_time=execution_start_time,
123
- execution_end_time=time.monotonic(),
124
- )
155
+ # This is a status from an unsuccessful RPC; this
156
+ # shouldn't happen, but we handle it.
157
+ logger.error("task management RPC failed", exc_info=e)
158
+
159
+ task_info.output = TaskOutput.function_executor_unresponsive(
160
+ allocation=task_info.allocation,
161
+ execution_start_time=execution_start_time,
162
+ execution_end_time=time.monotonic(),
163
+ )
125
164
  except asyncio.CancelledError:
165
+ # Handle aio task cancellation during `await _run_task_rpcs`.
126
166
  # The task is still running in FE, we only cancelled the client-side RPC.
127
167
  function_executor_termination_reason = (
128
168
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
@@ -133,15 +173,20 @@ async def run_task_on_function_executor(
133
173
  execution_end_time=time.monotonic(),
134
174
  )
135
175
  except Exception as e:
136
- metric_function_executor_run_task_rpc_errors.inc()
137
- logger.error("task execution failed", exc_info=e)
176
+ # This is an unexpected exception; we believe that this
177
+ # indicates an internal error.
178
+ logger.error(
179
+ "Unexpected internal error during task lifecycle RPC sequence", exc_info=e
180
+ )
138
181
  task_info.output = TaskOutput.internal_error(
139
182
  allocation=task_info.allocation,
140
183
  execution_start_time=execution_start_time,
141
184
  execution_end_time=time.monotonic(),
142
185
  )
143
186
 
144
- metric_function_executor_run_task_rpc_latency.observe(time.monotonic() - start_time)
187
+ metric_function_executor_run_task_rpc_latency.observe(
188
+ time.monotonic() - execution_start_time
189
+ )
145
190
  metric_function_executor_run_task_rpcs_in_progress.dec()
146
191
 
147
192
  function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
@@ -152,16 +197,21 @@ async def run_task_on_function_executor(
152
197
  task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
153
198
  and function_executor_termination_reason is None
154
199
  ):
155
- # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
156
- result: HealthCheckResult = await function_executor.health_checker().check()
157
- if not result.is_healthy:
158
- function_executor_termination_reason = (
159
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
160
- )
161
- logger.error(
162
- "Function Executor health check failed after running task, shutting down Function Executor",
163
- health_check_fail_reason=result.reason,
164
- )
200
+ try:
201
+ # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
202
+ result: HealthCheckResult = await function_executor.health_checker().check()
203
+ if not result.is_healthy:
204
+ function_executor_termination_reason = (
205
+ FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
206
+ )
207
+ logger.error(
208
+ "Function Executor health check failed after running task, shutting down Function Executor",
209
+ health_check_fail_reason=result.reason,
210
+ )
211
+ except asyncio.CancelledError:
212
+ # The aio task was cancelled during the health check await.
213
+ # We can't conclude anything about the health of the FE here.
214
+ pass
165
215
 
166
216
  _log_task_execution_finished(output=task_info.output, logger=logger)
167
217
 
@@ -171,56 +221,106 @@ async def run_task_on_function_executor(
171
221
  )
172
222
 
173
223
 
174
- def _task_output_from_function_executor_response(
224
+ async def _run_task_rpcs(
225
+ task: Task, function_executor: FunctionExecutor, timeout_sec: float
226
+ ) -> TaskResult:
227
+ """Runs the task, returning the result, reporting errors via exceptions."""
228
+ task_result: Optional[TaskResult] = None
229
+ channel: grpc.aio.Channel = function_executor.channel()
230
+ fe_stub = FunctionExecutorStub(channel)
231
+
232
+ # Create task with timeout
233
+ await fe_stub.create_task(
234
+ CreateTaskRequest(task=task), timeout=_CREATE_TASK_TIMEOUT_SECS
235
+ )
236
+
237
+ # Await task with timeout resets on each response
238
+ await_rpc = fe_stub.await_task(AwaitTaskRequest(task_id=task.task_id))
239
+
240
+ try:
241
+ while True:
242
+ # Wait for next response with fresh timeout each time
243
+ response: AwaitTaskProgress = await asyncio.wait_for(
244
+ await_rpc.read(), timeout=timeout_sec
245
+ )
246
+
247
+ if response == grpc.aio.EOF:
248
+ break
249
+ elif response.WhichOneof("response") == "task_result":
250
+ task_result = response.task_result
251
+ break
252
+
253
+ # NB: We don't actually check for other message types
254
+ # here; any message from the FE is treated as an
255
+ # indication that it's making forward progress.
256
+ finally:
257
+ # Cancel the outstanding RPC to ensure any resources in use
258
+ # are cleaned up; note that this is idempotent (in case the
259
+ # RPC has already completed).
260
+ await_rpc.cancel()
261
+
262
+ # Delete task with timeout
263
+ await fe_stub.delete_task(
264
+ DeleteTaskRequest(task_id=task.task_id), timeout=_DELETE_TASK_TIMEOUT_SECS
265
+ )
266
+
267
+ if task_result is None:
268
+ raise grpc.aio.AioRpcError(
269
+ grpc.StatusCode.CANCELLED,
270
+ None,
271
+ None,
272
+ "Function Executor didn't return function/task alloc result",
273
+ )
274
+
275
+ return task_result
276
+
277
+
278
+ def _task_output_from_function_executor_result(
175
279
  allocation: TaskAllocation,
176
- response: RunTaskResponse,
280
+ result: TaskResult,
177
281
  execution_start_time: Optional[float],
178
282
  execution_end_time: Optional[float],
179
283
  logger: Any,
180
284
  ) -> TaskOutput:
181
- response_validator = MessageValidator(response)
182
- response_validator.required_field("stdout")
183
- response_validator.required_field("stderr")
285
+ response_validator = MessageValidator(result)
184
286
  response_validator.required_field("outcome_code")
185
287
 
186
288
  metrics = TaskMetrics(counters={}, timers={})
187
- if response.HasField("metrics"):
289
+ if result.HasField("metrics"):
188
290
  # Can be None if e.g. function failed.
189
- metrics.counters = dict(response.metrics.counters)
190
- metrics.timers = dict(response.metrics.timers)
291
+ metrics.counters = dict(result.metrics.counters)
292
+ metrics.timers = dict(result.metrics.timers)
191
293
 
192
294
  outcome_code: TaskOutcomeCode = _to_task_outcome_code(
193
- response.outcome_code, logger=logger
295
+ result.outcome_code, logger=logger
194
296
  )
195
297
  failure_reason: Optional[TaskFailureReason] = None
196
- invocation_error_output: Optional[SerializedObject] = None
298
+ invocation_error_output: Optional[SerializedObjectInsideBLOB] = None
299
+ uploaded_invocation_error_blob: Optional[BLOB] = None
197
300
 
198
301
  if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
199
302
  response_validator.required_field("failure_reason")
200
303
  failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
201
- response.failure_reason, logger
304
+ result.failure_reason, logger
202
305
  )
203
306
  if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
204
307
  response_validator.required_field("invocation_error_output")
205
- invocation_error_output = response.invocation_error_output
206
-
207
- if _ENABLE_INJECT_TASK_CANCELLATIONS:
208
- logger.warning("injecting cancellation failure for the task allocation")
209
- if (
210
- random.random() < 0.5
211
- ): # 50% chance to get stable reproduction in manual testing
212
- outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
213
- failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
308
+ response_validator.required_field("uploaded_invocation_error_blob")
309
+ invocation_error_output = result.invocation_error_output
310
+ uploaded_invocation_error_blob = result.uploaded_invocation_error_blob
311
+ elif outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
312
+ # function_outputs can have no items, this happens when the function returns None.
313
+ response_validator.required_field("uploaded_function_outputs_blob")
214
314
 
215
315
  return TaskOutput(
216
316
  allocation=allocation,
217
317
  outcome_code=outcome_code,
218
318
  failure_reason=failure_reason,
319
+ function_outputs=list(result.function_outputs),
320
+ uploaded_function_outputs_blob=result.uploaded_function_outputs_blob,
219
321
  invocation_error_output=invocation_error_output,
220
- function_outputs=response.function_outputs,
221
- next_functions=response.next_functions,
222
- stdout=response.stdout,
223
- stderr=response.stderr,
322
+ uploaded_invocation_error_blob=uploaded_invocation_error_blob,
323
+ next_functions=list(result.next_functions),
224
324
  metrics=metrics,
225
325
  execution_start_time=execution_start_time,
226
326
  execution_end_time=execution_end_time,
@@ -240,6 +340,14 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
240
340
  )
241
341
 
242
342
 
343
+ def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) -> None:
344
+ MessageValidator(task_diagnostics).required_field("function_executor_log")
345
+ # Uncomment these lines once we stop printing FE logs to stdout/stderr.
346
+ # Print FE logs directly to Executor logs so operators can see them.
347
+ # logger.info("Function Executor logs during task execution:")
348
+ # print(task_diagnostics.function_executor_log)
349
+
350
+
243
351
  def _to_task_outcome_code(
244
352
  fe_task_outcome_code: FETaskOutcomeCode, logger
245
353
  ) -> TaskOutcomeCode:
@@ -2,10 +2,9 @@ import asyncio
2
2
  from dataclasses import dataclass
3
3
  from typing import Optional
4
4
 
5
- from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
6
-
7
5
  from indexify.proto.executor_api_pb2 import TaskAllocation
8
6
 
7
+ from .task_input import TaskInput
9
8
  from .task_output import TaskOutput
10
9
 
11
10
 
@@ -22,11 +21,9 @@ class TaskInfo:
22
21
  is_cancelled: bool = False
23
22
  # aio task that is currently executing a lifecycle step of this task.
24
23
  aio_task: Optional[asyncio.Task] = None
25
- # Downloaded input if function was prepared successfully.
26
- input: Optional[SerializedObject] = None
27
- # Downloaded init value if function was prepared successfully and is a reducer.
28
- init_value: Optional[SerializedObject] = None
29
- # Output of the task.
24
+ # Input if function was prepared successfully.
25
+ input: Optional[TaskInput] = None
26
+ # Output of the task, always set when the task is completed.
30
27
  output: Optional[TaskOutput] = None
31
28
  # True if the task is fully completed and was added to state reporter.
32
29
  is_completed: bool = False
@@ -0,0 +1,21 @@
1
+ from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
2
+
3
+
4
+ class TaskInput:
5
+ """Represents the input for a task in the function executor controller."""
6
+
7
+ def __init__(
8
+ self,
9
+ function_inputs: FunctionInputs,
10
+ function_outputs_blob_uri: str,
11
+ function_outputs_blob_upload_id: str,
12
+ invocation_error_blob_uri: str,
13
+ invocation_error_blob_upload_id: str,
14
+ ):
15
+ # Actual input object sent to FE.
16
+ self.function_inputs = function_inputs
17
+ # Executor side function input related bookkeeping.
18
+ self.function_outputs_blob_uri = function_outputs_blob_uri
19
+ self.function_outputs_blob_upload_id = function_outputs_blob_upload_id
20
+ self.invocation_error_blob_uri = invocation_error_blob_uri
21
+ self.invocation_error_blob_upload_id = invocation_error_blob_upload_id
@@ -1,19 +1,17 @@
1
1
  from typing import Any, Dict, List, Optional
2
2
 
3
3
  from tensorlake.function_executor.proto.function_executor_pb2 import (
4
- SerializedObject,
4
+ BLOB,
5
+ SerializedObjectInsideBLOB,
5
6
  )
6
7
 
7
8
  from indexify.proto.executor_api_pb2 import (
8
- DataPayload,
9
9
  FunctionExecutorTerminationReason,
10
10
  TaskAllocation,
11
11
  TaskFailureReason,
12
12
  TaskOutcomeCode,
13
13
  )
14
14
 
15
- from .function_executor_startup_output import FunctionExecutorStartupOutput
16
-
17
15
 
18
16
  class TaskMetrics:
19
17
  """Metrics for a task."""
@@ -30,33 +28,27 @@ class TaskOutput:
30
28
  self,
31
29
  allocation: TaskAllocation,
32
30
  outcome_code: TaskOutcomeCode,
33
- # Optional[TaskFailureReason] is not supported in python 3.9
34
- failure_reason: TaskFailureReason = None,
35
- invocation_error_output: Optional[SerializedObject] = None,
36
- function_outputs: List[SerializedObject] = [],
31
+ failure_reason: Optional[TaskFailureReason] = None,
32
+ function_outputs: List[SerializedObjectInsideBLOB] = [],
33
+ uploaded_function_outputs_blob: Optional[BLOB] = None,
34
+ invocation_error_output: Optional[SerializedObjectInsideBLOB] = None,
35
+ uploaded_invocation_error_blob: Optional[BLOB] = None,
37
36
  next_functions: List[str] = [],
38
- stdout: Optional[str] = None,
39
- stderr: Optional[str] = None,
40
37
  metrics: Optional[TaskMetrics] = None,
41
38
  execution_start_time: Optional[float] = None,
42
39
  execution_end_time: Optional[float] = None,
43
40
  ):
44
- self.task = allocation.task
45
41
  self.allocation = allocation
46
- self.function_outputs = function_outputs
47
- self.next_functions = next_functions
48
- self.stdout = stdout
49
- self.stderr = stderr
50
42
  self.outcome_code = outcome_code
51
43
  self.failure_reason = failure_reason
44
+ self.function_outputs = function_outputs
45
+ self.uploaded_function_outputs_blob = uploaded_function_outputs_blob
52
46
  self.invocation_error_output = invocation_error_output
47
+ self.uploaded_invocation_error_blob = uploaded_invocation_error_blob
48
+ self.next_functions = next_functions
53
49
  self.metrics = metrics
54
50
  self.execution_start_time = execution_start_time
55
51
  self.execution_end_time = execution_end_time
56
- self.uploaded_data_payloads: List[DataPayload] = []
57
- self.uploaded_stdout: Optional[DataPayload] = None
58
- self.uploaded_stderr: Optional[DataPayload] = None
59
- self.uploaded_invocation_error_output: Optional[DataPayload] = None
60
52
 
61
53
  @classmethod
62
54
  def internal_error(
@@ -66,12 +58,10 @@ class TaskOutput:
66
58
  execution_end_time: Optional[float],
67
59
  ) -> "TaskOutput":
68
60
  """Creates a TaskOutput for an internal error."""
69
- # We are not sharing internal error messages with the customer.
70
61
  return TaskOutput(
71
62
  allocation=allocation,
72
63
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
73
64
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
74
- stderr="Platform failed to execute the function.",
75
65
  execution_start_time=execution_start_time,
76
66
  execution_end_time=execution_end_time,
77
67
  )
@@ -80,17 +70,33 @@ class TaskOutput:
80
70
  def function_timeout(
81
71
  cls,
82
72
  allocation: TaskAllocation,
83
- timeout_sec: float,
84
73
  execution_start_time: Optional[float],
85
74
  execution_end_time: Optional[float],
86
75
  ) -> "TaskOutput":
87
- """Creates a TaskOutput for an function timeout error."""
88
- # Task stdout, stderr is not available.
76
+ """Creates a TaskOutput for a function timeout error."""
89
77
  return TaskOutput(
90
78
  allocation=allocation,
91
79
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
92
80
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
93
- stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
81
+ execution_start_time=execution_start_time,
82
+ execution_end_time=execution_end_time,
83
+ )
84
+
85
+ @classmethod
86
+ def function_executor_unresponsive(
87
+ cls,
88
+ allocation: TaskAllocation,
89
+ execution_start_time: Optional[float],
90
+ execution_end_time: Optional[float],
91
+ ) -> "TaskOutput":
92
+ """Creates a TaskOutput for an unresponsive FE aka grey failure."""
93
+ # When FE is unresponsive we don't know exact cause of the failure.
94
+ return TaskOutput(
95
+ allocation=allocation,
96
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
97
+ # Treat the grey failure as a function error and thus charge the customer.
98
+ # This is to prevent service abuse by intentionally misbehaving functions.
99
+ failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR,
94
100
  execution_start_time=execution_start_time,
95
101
  execution_end_time=execution_end_time,
96
102
  )
@@ -127,21 +133,17 @@ class TaskOutput:
127
133
  def function_executor_startup_failed(
128
134
  cls,
129
135
  allocation: TaskAllocation,
130
- fe_startup_output: FunctionExecutorStartupOutput,
136
+ fe_termination_reason: FunctionExecutorTerminationReason,
131
137
  logger: Any,
132
138
  ) -> "TaskOutput":
133
139
  """Creates a TaskOutput for the case when we fail a task that didn't run because its FE startup failed."""
134
- output = TaskOutput(
140
+ return TaskOutput(
135
141
  allocation=allocation,
136
142
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
137
143
  failure_reason=_fe_startup_failure_reason_to_task_failure_reason(
138
- fe_startup_output.termination_reason, logger
144
+ fe_termination_reason, logger
139
145
  ),
140
146
  )
141
- # Use FE startup stdout, stderr for allocations that we failed because FE startup failed.
142
- output.uploaded_stdout = fe_startup_output.stdout
143
- output.uploaded_stderr = fe_startup_output.stderr
144
- return output
145
147
 
146
148
 
147
149
  def _fe_startup_failure_reason_to_task_failure_reason(
@@ -163,6 +165,12 @@ def _fe_startup_failure_reason_to_task_failure_reason(
163
165
  == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR
164
166
  ):
165
167
  return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
168
+ elif (
169
+ fe_termination_reason
170
+ == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
171
+ ):
172
+ # This fe termination reason is used when FE gets deleted by Server from desired state while it's starting up.
173
+ return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
166
174
  else:
167
175
  logger.error(
168
176
  "unexpected function executor startup failure reason",
@@ -170,4 +178,4 @@ def _fe_startup_failure_reason_to_task_failure_reason(
170
178
  fe_termination_reason
171
179
  ),
172
180
  )
173
- return TaskFailureReason.TASK_FAILURE_REASON_UNKNOWN
181
+ return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
@@ -29,7 +29,12 @@ async def terminate_function_executor(
29
29
  logger.info(
30
30
  "destroying function executor",
31
31
  )
32
- await function_executor.destroy()
32
+ try:
33
+ # This await is a cancellation point, need to shield to ensure we destroyed the FE.
34
+ await asyncio.shield(function_executor.destroy())
35
+ except asyncio.CancelledError:
36
+ # We actually destroyed the FE so we can return without error.
37
+ pass
33
38
 
34
39
  return FunctionExecutorTerminated(
35
40
  is_success=True,