indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/run_task.py

@@ -1,14 +1,17 @@
 import asyncio
-import os
-import random
 import time
 from typing import Any, Optional
 
 import grpc
 from tensorlake.function_executor.proto.function_executor_pb2 import (
-
-
-
+    BLOB,
+    AwaitTaskProgress,
+    AwaitTaskRequest,
+    CreateTaskRequest,
+    DeleteTaskRequest,
+    SerializedObjectInsideBLOB,
+    Task,
+    TaskDiagnostics,
 )
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     TaskFailureReason as FETaskFailureReason,
@@ -16,6 +19,9 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     TaskOutcomeCode as FETaskOutcomeCode,
 )
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    TaskResult,
+)
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
@@ -40,9 +46,8 @@ from .metrics.run_task import (
 from .task_info import TaskInfo
 from .task_output import TaskMetrics, TaskOutput
 
-
-
-)
+_CREATE_TASK_TIMEOUT_SECS = 5
+_DELETE_TASK_TIMEOUT_SECS = 5
 
 
 async def run_task_on_function_executor(
@@ -53,7 +58,22 @@ async def run_task_on_function_executor(
     Doesn't raise any exceptions.
     """
     logger = logger.bind(module=__name__)
-
+
+    if task_info.input is None:
+        logger.error(
+            "task input is None, this should never happen",
+        )
+        task_info.output = TaskOutput.internal_error(
+            allocation=task_info.allocation,
+            execution_start_time=None,
+            execution_end_time=None,
+        )
+        return TaskExecutionFinished(
+            task_info=task_info,
+            function_executor_termination_reason=None,
+        )
+
+    task = Task(
         namespace=task_info.allocation.task.namespace,
         graph_name=task_info.allocation.task.graph_name,
         graph_version=task_info.allocation.task.graph_version,
@@ -61,15 +81,8 @@ async def run_task_on_function_executor(
         graph_invocation_id=task_info.allocation.task.graph_invocation_id,
         task_id=task_info.allocation.task.id,
         allocation_id=task_info.allocation.allocation_id,
-
+        request=task_info.input.function_inputs,
     )
-    # Don't keep the input in memory after we started running the task.
-    task_info.input = None
-
-    if task_info.init_value is not None:
-        request.function_init_value.CopyFrom(task_info.init_value)
-        # Don't keep the init value in memory after we started running the task.
-        task_info.init_value = None
 
     function_executor.invocation_state_client().add_task_to_invocation_id_entry(
         task_id=task_info.allocation.task.id,
@@ -78,51 +91,78 @@ async def run_task_on_function_executor(
 
     metric_function_executor_run_task_rpcs.inc()
     metric_function_executor_run_task_rpcs_in_progress.inc()
-    start_time = time.monotonic()
     # Not None if the Function Executor should be terminated after running the task.
     function_executor_termination_reason: Optional[
         FunctionExecutorTerminationReason
     ] = None
-
+
+    # NB: We start this timer before invoking the first RPC, since
+    # user code should be executing by the time the create_task() RPC
+    # returns, so not attributing the task management RPC overhead to
+    # the user would open a possibility for abuse. (This is somewhat
+    # mitigated by the fact that these RPCs should have a very low
+    # overhead.)
+    execution_start_time: Optional[float] = time.monotonic()
 
     # If this RPC failed due to customer code crashing the server we won't be
     # able to detect this. We'll treat this as our own error for now and thus
     # let the AioRpcError to be raised here.
-    timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
+    timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
     try:
-
-
-
-
-
-        task_info.output =
+        # This aio task can only be cancelled during this await call.
+        task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
+
+        _process_task_diagnostics(task_result.diagnostics, logger)
+
+        task_info.output = _task_output_from_function_executor_result(
             allocation=task_info.allocation,
-
+            result=task_result,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
             logger=logger,
         )
+    except asyncio.TimeoutError:
+        # This is an await_task() RPC timeout - we're not getting
+        # progress messages or a task completion.
+        function_executor_termination_reason = (
+            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
+        )
+        task_info.output = TaskOutput.function_timeout(
+            allocation=task_info.allocation,
+            execution_start_time=execution_start_time,
+            execution_end_time=time.monotonic(),
+        )
     except grpc.aio.AioRpcError as e:
+        # This indicates some sort of problem communicating with the FE.
+        #
+        # NB: We charge the user in these situations: code within the
+        # FE is not isolated, so not charging would enable abuse.
+        #
+        # This is an unexpected situation, though, so we make sure to
+        # log the situation for further investigation.
+
+        function_executor_termination_reason = (
+            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+        )
+        metric_function_executor_run_task_rpc_errors.inc()
+
         if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            #
-
-
-            )
-            task_info.output = TaskOutput.function_timeout(
-                allocation=task_info.allocation,
-                timeout_sec=timeout_sec,
-                execution_start_time=execution_start_time,
-                execution_end_time=time.monotonic(),
-            )
+            # This is either a create_task() RPC timeout or a
+            # delete_task() RPC timeout; either suggests that the FE
+            # is unhealthy.
+            logger.error("task management RPC execution deadline exceeded", exc_info=e)
         else:
-
-
-
-
-
-
-
+            # This is a status from an unsuccessful RPC; this
+            # shouldn't happen, but we handle it.
+            logger.error("task management RPC failed", exc_info=e)
+
+        task_info.output = TaskOutput.function_executor_unresponsive(
+            allocation=task_info.allocation,
+            execution_start_time=execution_start_time,
+            execution_end_time=time.monotonic(),
+        )
     except asyncio.CancelledError:
+        # Handle aio task cancellation during `await _run_task_rpcs`.
         # The task is still running in FE, we only cancelled the client-side RPC.
         function_executor_termination_reason = (
            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
@@ -133,15 +173,20 @@ async def run_task_on_function_executor(
             execution_end_time=time.monotonic(),
         )
     except Exception as e:
-
-
+        # This is an unexpected exception; we believe that this
+        # indicates an internal error.
+        logger.error(
+            "Unexpected internal error during task lifecycle RPC sequence", exc_info=e
+        )
         task_info.output = TaskOutput.internal_error(
             allocation=task_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
 
-    metric_function_executor_run_task_rpc_latency.observe(
+    metric_function_executor_run_task_rpc_latency.observe(
+        time.monotonic() - execution_start_time
+    )
     metric_function_executor_run_task_rpcs_in_progress.dec()
 
     function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
@@ -152,16 +197,21 @@ async def run_task_on_function_executor(
         task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
         and function_executor_termination_reason is None
     ):
-
-
-
-
-
-
-
-
-
-
+        try:
+            # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
+            result: HealthCheckResult = await function_executor.health_checker().check()
+            if not result.is_healthy:
+                function_executor_termination_reason = (
+                    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+                )
+                logger.error(
+                    "Function Executor health check failed after running task, shutting down Function Executor",
+                    health_check_fail_reason=result.reason,
+                )
+        except asyncio.CancelledError:
+            # The aio task was cancelled during the health check await.
+            # We can't conclude anything about the health of the FE here.
+            pass
 
     _log_task_execution_finished(output=task_info.output, logger=logger)
 
@@ -171,56 +221,106 @@ async def run_task_on_function_executor(
     )
 
 
-def
+async def _run_task_rpcs(
+    task: Task, function_executor: FunctionExecutor, timeout_sec: float
+) -> TaskResult:
+    """Runs the task, returning the result, reporting errors via exceptions."""
+    task_result: Optional[TaskResult] = None
+    channel: grpc.aio.Channel = function_executor.channel()
+    fe_stub = FunctionExecutorStub(channel)
+
+    # Create task with timeout
+    await fe_stub.create_task(
+        CreateTaskRequest(task=task), timeout=_CREATE_TASK_TIMEOUT_SECS
+    )
+
+    # Await task with timeout resets on each response
+    await_rpc = fe_stub.await_task(AwaitTaskRequest(task_id=task.task_id))
+
+    try:
+        while True:
+            # Wait for next response with fresh timeout each time
+            response: AwaitTaskProgress = await asyncio.wait_for(
+                await_rpc.read(), timeout=timeout_sec
+            )
+
+            if response == grpc.aio.EOF:
+                break
+            elif response.WhichOneof("response") == "task_result":
+                task_result = response.task_result
+                break
+
+            # NB: We don't actually check for other message types
+            # here; any message from the FE is treated as an
+            # indication that it's making forward progress.
+    finally:
+        # Cancel the outstanding RPC to ensure any resources in use
+        # are cleaned up; note that this is idempotent (in case the
+        # RPC has already completed).
+        await_rpc.cancel()
+
+    # Delete task with timeout
+    await fe_stub.delete_task(
+        DeleteTaskRequest(task_id=task.task_id), timeout=_DELETE_TASK_TIMEOUT_SECS
+    )
+
+    if task_result is None:
+        raise grpc.aio.AioRpcError(
+            grpc.StatusCode.CANCELLED,
+            None,
+            None,
+            "Function Executor didn't return function/task alloc result",
+        )
+
+    return task_result
+
+
+def _task_output_from_function_executor_result(
     allocation: TaskAllocation,
-
+    result: TaskResult,
     execution_start_time: Optional[float],
     execution_end_time: Optional[float],
     logger: Any,
 ) -> TaskOutput:
-    response_validator = MessageValidator(
-    response_validator.required_field("stdout")
-    response_validator.required_field("stderr")
+    response_validator = MessageValidator(result)
     response_validator.required_field("outcome_code")
 
     metrics = TaskMetrics(counters={}, timers={})
-    if
+    if result.HasField("metrics"):
         # Can be None if e.g. function failed.
-        metrics.counters = dict(
-        metrics.timers = dict(
+        metrics.counters = dict(result.metrics.counters)
+        metrics.timers = dict(result.metrics.timers)
 
     outcome_code: TaskOutcomeCode = _to_task_outcome_code(
-
+        result.outcome_code, logger=logger
     )
     failure_reason: Optional[TaskFailureReason] = None
-    invocation_error_output: Optional[
+    invocation_error_output: Optional[SerializedObjectInsideBLOB] = None
+    uploaded_invocation_error_blob: Optional[BLOB] = None
 
     if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
         response_validator.required_field("failure_reason")
         failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
-
+            result.failure_reason, logger
         )
         if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
             response_validator.required_field("invocation_error_output")
-
-
-
-
-
-
-        ):  # 50% chance to get stable reproduction in manual testing
-            outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
-            failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
+            response_validator.required_field("uploaded_invocation_error_blob")
+            invocation_error_output = result.invocation_error_output
+            uploaded_invocation_error_blob = result.uploaded_invocation_error_blob
+    elif outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+        # function_outputs can have no items, this happens when the function returns None.
+        response_validator.required_field("uploaded_function_outputs_blob")
 
     return TaskOutput(
         allocation=allocation,
         outcome_code=outcome_code,
         failure_reason=failure_reason,
+        function_outputs=list(result.function_outputs),
+        uploaded_function_outputs_blob=result.uploaded_function_outputs_blob,
         invocation_error_output=invocation_error_output,
-
-        next_functions=
-        stdout=response.stdout,
-        stderr=response.stderr,
+        uploaded_invocation_error_blob=uploaded_invocation_error_blob,
+        next_functions=list(result.next_functions),
         metrics=metrics,
         execution_start_time=execution_start_time,
         execution_end_time=execution_end_time,
@@ -240,6 +340,14 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
     )
 
 
+def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) -> None:
+    MessageValidator(task_diagnostics).required_field("function_executor_log")
+    # Uncomment these lines once we stop printing FE logs to stdout/stderr.
+    # Print FE logs directly to Executor logs so operators can see them.
+    # logger.info("Function Executor logs during task execution:")
+    # print(task_diagnostics.function_executor_log)
+
+
 def _to_task_outcome_code(
     fe_task_outcome_code: FETaskOutcomeCode, logger
 ) -> TaskOutcomeCode:
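Note on the run_task.py rewrite above: the single long-deadline run_task() RPC is replaced by a three-step lifecycle — create_task(), a streamed await_task() whose timeout resets on every AwaitTaskProgress message, then delete_task() — so a function only times out when the Function Executor stops reporting progress. Below is a minimal, self-contained sketch of that watchdog pattern; await_with_progress and fake_stream are illustrative names standing in for the gRPC stream, not indexify APIs.

import asyncio
from typing import AsyncIterator, Optional


async def await_with_progress(
    stream: AsyncIterator[str], timeout_sec: float
) -> Optional[str]:
    # A fresh timeout is applied to every read: any message counts as
    # forward progress, and only a silent stream raises TimeoutError.
    it = stream.__aiter__()
    while True:
        msg = await asyncio.wait_for(it.__anext__(), timeout=timeout_sec)
        if msg == "result":  # terminal message, analogous to task_result
            return msg


async def main() -> None:
    async def fake_stream() -> AsyncIterator[str]:
        for _ in range(3):
            await asyncio.sleep(0.1)  # each tick arrives well under the timeout
            yield "progress"
        yield "result"

    print(await await_with_progress(fake_stream(), timeout_sec=0.5))


asyncio.run(main())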
indexify/executor/function_executor_controller/task_info.py

@@ -2,10 +2,9 @@ import asyncio
 from dataclasses import dataclass
 from typing import Optional
 
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-
 from indexify.proto.executor_api_pb2 import TaskAllocation
 
+from .task_input import TaskInput
 from .task_output import TaskOutput
 
 
@@ -22,11 +21,9 @@ class TaskInfo:
     is_cancelled: bool = False
     # aio task that is currently executing a lifecycle step of this task.
     aio_task: Optional[asyncio.Task] = None
-    #
-    input: Optional[
-    #
-    init_value: Optional[SerializedObject] = None
-    # Output of the task.
+    # Input if function was prepared successfully.
+    input: Optional[TaskInput] = None
+    # Output of the task, always set when the task is completed.
     output: Optional[TaskOutput] = None
     # True if the task is fully completed and was added to state reporter.
     is_completed: bool = False
indexify/executor/function_executor_controller/task_input.py (new file)

@@ -0,0 +1,21 @@
+from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
+
+
+class TaskInput:
+    """Represents the input for a task in the function executor controller."""
+
+    def __init__(
+        self,
+        function_inputs: FunctionInputs,
+        function_outputs_blob_uri: str,
+        function_outputs_blob_upload_id: str,
+        invocation_error_blob_uri: str,
+        invocation_error_blob_upload_id: str,
+    ):
+        # Actual input object sent to FE.
+        self.function_inputs = function_inputs
+        # Executor side function input related bookkeeping.
+        self.function_outputs_blob_uri = function_outputs_blob_uri
+        self.function_outputs_blob_upload_id = function_outputs_blob_upload_id
+        self.invocation_error_blob_uri = invocation_error_blob_uri
+        self.invocation_error_blob_upload_id = invocation_error_blob_upload_id
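For context, TaskInput pairs the FunctionInputs proto message sent to the Function Executor with executor-side blob bookkeeping (output/error blob URIs plus their upload ids). A hypothetical construction, with placeholder URI and upload-id values rather than anything indexify actually produces:

from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs

from indexify.executor.function_executor_controller.task_input import TaskInput

task_input = TaskInput(
    function_inputs=FunctionInputs(),  # normally populated by the prepare-task step
    function_outputs_blob_uri="s3://example-bucket/outputs/task-1",  # placeholder
    function_outputs_blob_upload_id="upload-outputs-1",  # placeholder
    invocation_error_blob_uri="s3://example-bucket/errors/task-1",  # placeholder
    invocation_error_blob_upload_id="upload-errors-1",  # placeholder
)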
indexify/executor/function_executor_controller/task_output.py

@@ -1,19 +1,17 @@
 from typing import Any, Dict, List, Optional
 
 from tensorlake.function_executor.proto.function_executor_pb2 import (
-
+    BLOB,
+    SerializedObjectInsideBLOB,
 )
 
 from indexify.proto.executor_api_pb2 import (
-    DataPayload,
     FunctionExecutorTerminationReason,
     TaskAllocation,
     TaskFailureReason,
     TaskOutcomeCode,
 )
 
-from .function_executor_startup_output import FunctionExecutorStartupOutput
-
 
 class TaskMetrics:
     """Metrics for a task."""
@@ -30,33 +28,27 @@ class TaskOutput:
         self,
         allocation: TaskAllocation,
         outcome_code: TaskOutcomeCode,
-
-
-
-
+        failure_reason: Optional[TaskFailureReason] = None,
+        function_outputs: List[SerializedObjectInsideBLOB] = [],
+        uploaded_function_outputs_blob: Optional[BLOB] = None,
+        invocation_error_output: Optional[SerializedObjectInsideBLOB] = None,
+        uploaded_invocation_error_blob: Optional[BLOB] = None,
         next_functions: List[str] = [],
-        stdout: Optional[str] = None,
-        stderr: Optional[str] = None,
         metrics: Optional[TaskMetrics] = None,
         execution_start_time: Optional[float] = None,
         execution_end_time: Optional[float] = None,
     ):
-        self.task = allocation.task
         self.allocation = allocation
-        self.function_outputs = function_outputs
-        self.next_functions = next_functions
-        self.stdout = stdout
-        self.stderr = stderr
         self.outcome_code = outcome_code
         self.failure_reason = failure_reason
+        self.function_outputs = function_outputs
+        self.uploaded_function_outputs_blob = uploaded_function_outputs_blob
         self.invocation_error_output = invocation_error_output
+        self.uploaded_invocation_error_blob = uploaded_invocation_error_blob
+        self.next_functions = next_functions
         self.metrics = metrics
         self.execution_start_time = execution_start_time
         self.execution_end_time = execution_end_time
-        self.uploaded_data_payloads: List[DataPayload] = []
-        self.uploaded_stdout: Optional[DataPayload] = None
-        self.uploaded_stderr: Optional[DataPayload] = None
-        self.uploaded_invocation_error_output: Optional[DataPayload] = None
 
     @classmethod
     def internal_error(
@@ -66,12 +58,10 @@ class TaskOutput:
         execution_end_time: Optional[float],
     ) -> "TaskOutput":
         """Creates a TaskOutput for an internal error."""
-        # We are not sharing internal error messages with the customer.
         return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
-            stderr="Platform failed to execute the function.",
             execution_start_time=execution_start_time,
             execution_end_time=execution_end_time,
         )
@@ -80,17 +70,33 @@ class TaskOutput:
     def function_timeout(
         cls,
         allocation: TaskAllocation,
-        timeout_sec: float,
         execution_start_time: Optional[float],
         execution_end_time: Optional[float],
     ) -> "TaskOutput":
-        """Creates a TaskOutput for
-        # Task stdout, stderr is not available.
+        """Creates a TaskOutput for a function timeout error."""
         return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
-
+            execution_start_time=execution_start_time,
+            execution_end_time=execution_end_time,
+        )
+
+    @classmethod
+    def function_executor_unresponsive(
+        cls,
+        allocation: TaskAllocation,
+        execution_start_time: Optional[float],
+        execution_end_time: Optional[float],
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for an unresponsive FE aka grey failure."""
+        # When FE is unresponsive we don't know exact cause of the failure.
+        return TaskOutput(
+            allocation=allocation,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            # Treat the grey failure as a function error and thus charge the customer.
+            # This is to prevent service abuse by intentionally misbehaving functions.
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR,
             execution_start_time=execution_start_time,
             execution_end_time=execution_end_time,
         )
@@ -127,21 +133,17 @@ class TaskOutput:
     def function_executor_startup_failed(
         cls,
         allocation: TaskAllocation,
-
+        fe_termination_reason: FunctionExecutorTerminationReason,
         logger: Any,
     ) -> "TaskOutput":
         """Creates a TaskOutput for the case when we fail a task that didn't run because its FE startup failed."""
-
+        return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=_fe_startup_failure_reason_to_task_failure_reason(
-
+                fe_termination_reason, logger
             ),
         )
-        # Use FE startup stdout, stderr for allocations that we failed because FE startup failed.
-        output.uploaded_stdout = fe_startup_output.stdout
-        output.uploaded_stderr = fe_startup_output.stderr
-        return output
 
 
 def _fe_startup_failure_reason_to_task_failure_reason(
@@ -163,6 +165,12 @@ def _fe_startup_failure_reason_to_task_failure_reason(
         == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR
     ):
         return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
+    elif (
+        fe_termination_reason
+        == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
+    ):
+        # This fe termination reason is used when FE gets deleted by Server from desired state while it's starting up.
+        return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
     else:
         logger.error(
             "unexpected function executor startup failure reason",
@@ -170,4 +178,4 @@ def _fe_startup_failure_reason_to_task_failure_reason(
                 fe_termination_reason
             ),
         )
-        return TaskFailureReason.
+        return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
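task_output.py leans on the classmethod-factory idiom: each failure mode (internal_error, function_timeout, function_executor_unresponsive, function_executor_startup_failed) is a named constructor that fixes the outcome code and failure reason together, so call sites cannot combine them inconsistently. A reduced sketch of the idiom with illustrative enums, not the real TaskOutput schema:

from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class Outcome(Enum):
    SUCCESS = auto()
    FAILURE = auto()


class FailureReason(Enum):
    INTERNAL_ERROR = auto()
    FUNCTION_TIMEOUT = auto()


@dataclass
class Output:
    outcome: Outcome
    failure_reason: Optional[FailureReason] = None

    @classmethod
    def internal_error(cls) -> "Output":
        # Pins both fields in one place; callers can't build a FAILURE
        # output that is missing (or carries the wrong) reason.
        return cls(Outcome.FAILURE, FailureReason.INTERNAL_ERROR)

    @classmethod
    def function_timeout(cls) -> "Output":
        return cls(Outcome.FAILURE, FailureReason.FUNCTION_TIMEOUT)


print(Output.internal_error())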
indexify/executor/function_executor_controller/terminate_function_executor.py

@@ -29,7 +29,12 @@ async def terminate_function_executor(
     logger.info(
         "destroying function executor",
     )
-
+    try:
+        # This await is a cancellation point, need to shield to ensure we destroyed the FE.
+        await asyncio.shield(function_executor.destroy())
+    except asyncio.CancelledError:
+        # We actually destroyed the FE so we can return without error.
+        pass
 
     return FunctionExecutorTerminated(
         is_success=True,