indexify 0.4.22__py3-none-any.whl → 0.4.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +77 -61
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +26 -35
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/METADATA +6 -6
- {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/RECORD +32 -30
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/WHEEL +0 -0
- {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Task finalization metrics.
#
# Counter of task finalizations.
metric_task_finalizations: prometheus_client.Counter = prometheus_client.Counter(
    name="task_finalizations",
    documentation="Number of task finalizations",
)
# Counter of task finalization errors.
metric_task_finalization_errors: prometheus_client.Counter = prometheus_client.Counter(
    name="task_finalization_errors",
    documentation="Number of task finalization errors",
)
# Gauge of tasks currently finalizing.
metric_tasks_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
    name="tasks_finalizing",
    documentation="Number of tasks currently finalizing",
)
# Latency histogram built by the shared helper for fast operations.
metric_task_finalization_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation("task_finalization", "task finalization")
)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Task preparation metrics.
#
# Counter of task preparations for execution.
metric_task_preparations: prometheus_client.Counter = prometheus_client.Counter(
    name="task_preparations",
    documentation="Number of task preparations for execution",
)
# Counter of task preparation errors.
metric_task_preparation_errors: prometheus_client.Counter = prometheus_client.Counter(
    name="task_preparation_errors",
    documentation="Number of task preparation errors",
)
# Latency histogram built by the shared helper for fast operations.
metric_task_preparation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "task_preparation", "task preparation for execution"
    )
)
# Gauge of tasks currently getting prepared for execution.
metric_tasks_getting_prepared: prometheus_client.Gauge = prometheus_client.Gauge(
    name="tasks_getting_prepared",
    documentation="Number of tasks currently getting prepared for execution",
)
|
@@ -1,38 +1,256 @@
|
|
1
|
-
|
1
|
+
import asyncio
|
2
|
+
import time
|
3
|
+
from typing import Any, List, Optional
|
4
|
+
|
5
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
+
BLOB,
|
7
|
+
BLOBChunk,
|
8
|
+
FunctionInputs,
|
9
|
+
SerializedObjectInsideBLOB,
|
10
|
+
)
|
2
11
|
|
3
12
|
from indexify.executor.blob_store.blob_store import BLOBStore
|
13
|
+
from indexify.proto.executor_api_pb2 import DataPayload, Task
|
4
14
|
|
5
|
-
from .downloads import
|
15
|
+
from .downloads import serialized_object_manifest_from_data_payload_proto
|
6
16
|
from .events import TaskPreparationFinished
|
17
|
+
from .metrics.prepare_task import (
|
18
|
+
metric_task_preparation_errors,
|
19
|
+
metric_task_preparation_latency,
|
20
|
+
metric_task_preparations,
|
21
|
+
metric_tasks_getting_prepared,
|
22
|
+
)
|
7
23
|
from .task_info import TaskInfo
|
24
|
+
from .task_input import TaskInput
|
25
|
+
|
26
|
+
# The constants below are bound by S3 limits,
# see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
#
# 7 days - max presigned URL validity duration and limit on max function duration.
_MAX_PRESIGNED_URI_EXPIRATION_SEC: int = 7 * 24 * 60 * 60
# 100 MB. This chunk size gives the best performance with S3. Based on our benchmarking.
_BLOB_OPTIMAL_CHUNK_SIZE_BYTES: int = 100 * 1024**2
# Each chunk requires a separate S3 presign operation, so the number of optimal
# chunks is limited to 100. S3 presign operations are local, it typically takes
# 30 ms per 100 URLs.
# Max output size with optimal chunks is 100 * 100 MB = 10 GB.
_OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT: int = 100
# 1 GB. This chunk size gives ~20% slower performance with S3 compared to optimal.
# Based on our benchmarking.
_OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES: int = 1024**3
# Max output size with slower chunks is 100 * 1 GB = 100 GB.
_OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
# 10 MB. Invocation error output is using a single chunk.
_INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024**2
|
8
43
|
|
9
44
|
|
10
45
|
async def prepare_task(
    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
) -> TaskPreparationFinished:
    """Prepares the task for execution and reports the result as an event.

    On success the prepared input is stored in `task_info.input` and the task
    is runnable. Doesn't raise any exceptions: cancellation and any other
    failure are turned into an unsuccessful event.

    Args:
        task_info: The task to prepare; its `input` field is set on success.
        blob_store: BLOB store used to prepare the task input.
        logger: Logger that supports `bind()` and keyword arguments.

    Returns:
        TaskPreparationFinished with `is_success=True` if the input was
        prepared, `is_success=False` otherwise.
    """
    log = logger.bind(module=__name__)
    started_at = time.monotonic()

    try:
        with (
            metric_task_preparation_errors.count_exceptions(),
            metric_tasks_getting_prepared.track_inprogress(),
            metric_task_preparation_latency.time(),
        ):
            metric_task_preparations.inc()
            prepared_input = await _prepare_task_input(
                task_info=task_info,
                blob_store=blob_store,
                logger=log,
            )
            task_info.input = prepared_input
            elapsed = time.monotonic() - started_at
            log.info("Task was prepared for execution", duration=elapsed)
            return TaskPreparationFinished(task_info=task_info, is_success=True)
    except asyncio.CancelledError:
        # Cancellation is reported as an unsuccessful preparation without an error log.
        return TaskPreparationFinished(
            task_info=task_info,
            is_success=False,
        )
    except BaseException as error:
        elapsed = time.monotonic() - started_at
        log.error(
            "Failed to prepare task for execution",
            exc_info=error,
            duration=elapsed,
        )
        return TaskPreparationFinished(
            task_info=task_info,
            is_success=False,
        )
|
84
|
+
|
85
|
+
|
86
|
+
async def _prepare_task_input(
    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
) -> TaskInput:
    """Builds the input of the task.

    Presigns the BLOBs with the function input and (for reducers) the init
    value, creates the multipart uploads for the function outputs and the
    invocation error, and presigns the BLOBs the function writes them to.

    If anything fails after a multipart upload was created (including the
    presign steps and cancellation), the created uploads are aborted before
    the exception is re-raised, so a failed preparation doesn't leave
    dangling multipart uploads behind.

    Args:
        task_info: The task to build the input for.
        blob_store: BLOB store used for presigning and multipart uploads.
        logger: Logger passed to the BLOB store operations.

    Returns:
        TaskInput with the inputs sent to Function Executor and the upload
        bookkeeping (URIs and upload ids) kept on Executor side.

    Raises an exception on error.
    """
    task: Task = task_info.allocation.task
    function_init_value_blob: Optional[BLOB] = None
    function_init_value: Optional[SerializedObjectInsideBLOB] = None
    if task.HasField("reducer_input"):
        function_init_value_blob = await _presign_function_input_blob(
            data_payload=task.reducer_input,
            blob_store=blob_store,
            logger=logger,
        )
        function_init_value = _to_serialized_object_inside_blob(task.reducer_input)

    function_outputs_blob_uri: str = (
        f"{task.output_payload_uri_prefix}.{task_info.allocation.allocation_id}.output"
    )
    invocation_error_blob_uri: str = (
        f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
    )

    # The uploads are completed when finalizing the task.
    function_outputs_blob_upload_id: Optional[str] = None
    invocation_error_blob_upload_id: Optional[str] = None

    try:
        function_outputs_blob_upload_id = await blob_store.create_multipart_upload(
            uri=function_outputs_blob_uri,
            logger=logger,
        )
        invocation_error_blob_upload_id = await blob_store.create_multipart_upload(
            uri=invocation_error_blob_uri,
            logger=logger,
        )

        # The presign steps are inside the try block on purpose: if any of them
        # fails, the uploads created above are aborted in the handler below.
        return TaskInput(
            function_inputs=FunctionInputs(
                function_input_blob=await _presign_function_input_blob(
                    data_payload=task.input,
                    blob_store=blob_store,
                    logger=logger,
                ),
                function_input=_to_serialized_object_inside_blob(task.input),
                function_init_value_blob=function_init_value_blob,
                function_init_value=function_init_value,
                function_outputs_blob=await _presign_function_outputs_blob(
                    uri=function_outputs_blob_uri,
                    upload_id=function_outputs_blob_upload_id,
                    blob_store=blob_store,
                    logger=logger,
                ),
                invocation_error_blob=await _presign_invocation_error_blob(
                    uri=invocation_error_blob_uri,
                    upload_id=invocation_error_blob_upload_id,
                    blob_store=blob_store,
                    logger=logger,
                ),
            ),
            function_outputs_blob_uri=function_outputs_blob_uri,
            function_outputs_blob_upload_id=function_outputs_blob_upload_id,
            invocation_error_blob_uri=invocation_error_blob_uri,
            invocation_error_blob_upload_id=invocation_error_blob_upload_id,
        )
    except BaseException:
        # Best effort cleanup: try to abort every upload that was created. A
        # failed abort is logged and must not prevent the other abort or hide
        # the original error, which is re-raised below.
        created_uploads = (
            (function_outputs_blob_uri, function_outputs_blob_upload_id),
            (invocation_error_blob_uri, invocation_error_blob_upload_id),
        )
        for blob_uri, upload_id in created_uploads:
            if upload_id is None:
                continue
            try:
                await blob_store.abort_multipart_upload(
                    uri=blob_uri,
                    upload_id=upload_id,
                    logger=logger,
                )
            except Exception as abort_error:
                logger.error(
                    "Failed to abort multipart upload after task preparation failure",
                    exc_info=abort_error,
                    uri=blob_uri,
                )
        raise
|
167
|
+
|
168
|
+
|
169
|
+
async def _presign_function_input_blob(
    data_payload: DataPayload, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Presigns a readable BLOB that contains the supplied data payload.

    All chunks share the same presigned GET URI and have the optimal chunk
    size. The chunks cover the BLOB from its beginning up to the end of the
    data payload.

    Args:
        data_payload: Payload whose `uri`, `offset` and `size` describe where
            the serialized object is stored.
        blob_store: BLOB store used to presign the GET URI.
        logger: Logger passed to the BLOB store operation.

    Returns:
        BLOB made of read chunks; it has no chunks if there are no bytes to cover.

    Raises an exception if presigning fails.
    """
    get_blob_uri: str = await blob_store.presign_get_uri(
        uri=data_payload.uri,
        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
        logger=logger,
    )

    # The serialized object is located at `data_payload.offset` inside the BLOB
    # (the same offset is passed to Function Executor in
    # SerializedObjectInsideBLOB), so it occupies bytes [offset, offset + size).
    # The chunks have to cover the BLOB up to the end of the object, covering
    # only `size` bytes is not enough when the offset is not zero.
    blob_bytes_to_cover: int = data_payload.offset + data_payload.size

    chunks: List[BLOBChunk] = []
    while len(chunks) * _BLOB_OPTIMAL_CHUNK_SIZE_BYTES < blob_bytes_to_cover:
        chunks.append(
            BLOBChunk(
                uri=get_blob_uri,  # The URI allows to read any byte range in the BLOB.
                size=_BLOB_OPTIMAL_CHUNK_SIZE_BYTES,
                # ETag is only set by FE when returning BLOBs to us
            )
        )

    return BLOB(
        chunks=chunks,
    )
|
191
|
+
|
192
|
+
|
193
|
+
async def _presign_function_outputs_blob(
    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Presigns the writable BLOB for the function outputs of the task.

    One upload part URI is presigned per chunk, part numbers start at 1. The
    first `_OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT` chunks have the optimal size, the
    remaining `_OUTPUT_BLOB_SLOWER_CHUNKS_COUNT` chunks have the slower size.

    Args:
        uri: URI of the outputs BLOB.
        upload_id: Id of the multipart upload the parts belong to.
        blob_store: BLOB store used to presign the upload part URIs.
        logger: Logger passed to the BLOB store operations.

    Returns:
        BLOB with one chunk per presigned upload part.
    """
    total_chunks_count: int = (
        _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT + _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT
    )
    output_chunks: List[BLOBChunk] = []

    for chunk_index in range(total_chunks_count):
        part_uri: str = await blob_store.presign_upload_part_uri(
            uri=uri,
            part_number=chunk_index + 1,
            upload_id=upload_id,
            expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
            logger=logger,
        )

        if chunk_index < _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT:
            part_size: int = _BLOB_OPTIMAL_CHUNK_SIZE_BYTES
        else:
            part_size = _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES

        # ETag is only set by FE when returning BLOBs to us
        output_chunks.append(BLOBChunk(uri=part_uri, size=part_size))

    return BLOB(chunks=output_chunks)
|
226
|
+
|
227
|
+
|
228
|
+
async def _presign_invocation_error_blob(
    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Presigns the writable BLOB for the invocation error output.

    The BLOB consists of a single chunk (upload part number 1) of
    `_INVOCATION_ERROR_MAX_SIZE_BYTES` size.

    Args:
        uri: URI of the invocation error BLOB.
        upload_id: Id of the multipart upload the part belongs to.
        blob_store: BLOB store used to presign the upload part URI.
        logger: Logger passed to the BLOB store operation.

    Returns:
        BLOB with the single presigned chunk.
    """
    part_uri: str = await blob_store.presign_upload_part_uri(
        uri=uri,
        part_number=1,
        upload_id=upload_id,
        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
        logger=logger,
    )
    # ETag is only set by FE when returning BLOBs to us
    only_chunk = BLOBChunk(uri=part_uri, size=_INVOCATION_ERROR_MAX_SIZE_BYTES)
    return BLOB(chunks=[only_chunk])
|
248
|
+
|
249
|
+
|
250
|
+
def _to_serialized_object_inside_blob(
    data_payload: DataPayload,
) -> SerializedObjectInsideBLOB:
    """Converts the data payload into the description of an object stored inside a BLOB.

    The manifest is derived from the data payload and the offset is copied from it.
    """
    object_manifest = serialized_object_manifest_from_data_payload_proto(data_payload)
    return SerializedObjectInsideBLOB(
        manifest=object_manifest, offset=data_payload.offset
    )
|
@@ -1,18 +1,17 @@
|
|
1
1
|
import asyncio
|
2
|
-
import os
|
3
|
-
import random
|
4
2
|
import time
|
5
3
|
from typing import Any, Optional
|
6
4
|
|
7
5
|
import grpc
|
8
6
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
|
+
BLOB,
|
9
8
|
AwaitTaskProgress,
|
10
9
|
AwaitTaskRequest,
|
11
10
|
CreateTaskRequest,
|
12
11
|
DeleteTaskRequest,
|
13
|
-
|
14
|
-
SerializedObject,
|
12
|
+
SerializedObjectInsideBLOB,
|
15
13
|
Task,
|
14
|
+
TaskDiagnostics,
|
16
15
|
)
|
17
16
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
18
17
|
TaskFailureReason as FETaskFailureReason,
|
@@ -47,10 +46,6 @@ from .metrics.run_task import (
|
|
47
46
|
from .task_info import TaskInfo
|
48
47
|
from .task_output import TaskMetrics, TaskOutput
|
49
48
|
|
50
|
-
_ENABLE_INJECT_TASK_CANCELLATIONS = (
|
51
|
-
os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
|
52
|
-
)
|
53
|
-
|
54
49
|
_CREATE_TASK_TIMEOUT_SECS = 5
|
55
50
|
_DELETE_TASK_TIMEOUT_SECS = 5
|
56
51
|
|
@@ -63,23 +58,31 @@ async def run_task_on_function_executor(
|
|
63
58
|
Doesn't raise any exceptions.
|
64
59
|
"""
|
65
60
|
logger = logger.bind(module=__name__)
|
61
|
+
|
62
|
+
if task_info.input is None:
|
63
|
+
logger.error(
|
64
|
+
"task input is None, this should never happen",
|
65
|
+
)
|
66
|
+
task_info.output = TaskOutput.internal_error(
|
67
|
+
allocation=task_info.allocation,
|
68
|
+
execution_start_time=None,
|
69
|
+
execution_end_time=None,
|
70
|
+
)
|
71
|
+
return TaskExecutionFinished(
|
72
|
+
task_info=task_info,
|
73
|
+
function_executor_termination_reason=None,
|
74
|
+
)
|
75
|
+
|
66
76
|
task = Task(
|
67
|
-
task_id=task_info.allocation.task.id,
|
68
77
|
namespace=task_info.allocation.task.namespace,
|
69
78
|
graph_name=task_info.allocation.task.graph_name,
|
70
79
|
graph_version=task_info.allocation.task.graph_version,
|
71
80
|
function_name=task_info.allocation.task.function_name,
|
72
81
|
graph_invocation_id=task_info.allocation.task.graph_invocation_id,
|
82
|
+
task_id=task_info.allocation.task.id,
|
73
83
|
allocation_id=task_info.allocation.allocation_id,
|
74
|
-
request=
|
84
|
+
request=task_info.input.function_inputs,
|
75
85
|
)
|
76
|
-
# Don't keep the input in memory after we started running the task.
|
77
|
-
task_info.input = None
|
78
|
-
|
79
|
-
if task_info.init_value is not None:
|
80
|
-
task.request.function_init_value.CopyFrom(task_info.init_value)
|
81
|
-
# Don't keep the init value in memory after we started running the task.
|
82
|
-
task_info.init_value = None
|
83
86
|
|
84
87
|
function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
85
88
|
task_id=task_info.allocation.task.id,
|
@@ -104,10 +107,13 @@ async def run_task_on_function_executor(
|
|
104
107
|
# If this RPC failed due to customer code crashing the server we won't be
|
105
108
|
# able to detect this. We'll treat this as our own error for now and thus
|
106
109
|
# let the AioRpcError to be raised here.
|
107
|
-
timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
|
110
|
+
timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
|
108
111
|
try:
|
112
|
+
# This aio task can only be cancelled during this await call.
|
109
113
|
task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
|
110
114
|
|
115
|
+
_process_task_diagnostics(task_result.diagnostics, logger)
|
116
|
+
|
111
117
|
task_info.output = _task_output_from_function_executor_result(
|
112
118
|
allocation=task_info.allocation,
|
113
119
|
result=task_result,
|
@@ -123,7 +129,6 @@ async def run_task_on_function_executor(
|
|
123
129
|
)
|
124
130
|
task_info.output = TaskOutput.function_timeout(
|
125
131
|
allocation=task_info.allocation,
|
126
|
-
timeout_sec=timeout_sec,
|
127
132
|
execution_start_time=execution_start_time,
|
128
133
|
execution_end_time=time.monotonic(),
|
129
134
|
)
|
@@ -156,8 +161,8 @@ async def run_task_on_function_executor(
|
|
156
161
|
execution_start_time=execution_start_time,
|
157
162
|
execution_end_time=time.monotonic(),
|
158
163
|
)
|
159
|
-
|
160
164
|
except asyncio.CancelledError:
|
165
|
+
# Handle aio task cancellation during `await _run_task_rpcs`.
|
161
166
|
# The task is still running in FE, we only cancelled the client-side RPC.
|
162
167
|
function_executor_termination_reason = (
|
163
168
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
|
@@ -192,16 +197,21 @@ async def run_task_on_function_executor(
|
|
192
197
|
task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
|
193
198
|
and function_executor_termination_reason is None
|
194
199
|
):
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
200
|
+
try:
|
201
|
+
# Check if the task failed because the FE is unhealthy to prevent more tasks failing.
|
202
|
+
result: HealthCheckResult = await function_executor.health_checker().check()
|
203
|
+
if not result.is_healthy:
|
204
|
+
function_executor_termination_reason = (
|
205
|
+
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
|
206
|
+
)
|
207
|
+
logger.error(
|
208
|
+
"Function Executor health check failed after running task, shutting down Function Executor",
|
209
|
+
health_check_fail_reason=result.reason,
|
210
|
+
)
|
211
|
+
except asyncio.CancelledError:
|
212
|
+
# The aio task was cancelled during the health check await.
|
213
|
+
# We can't conclude anything about the health of the FE here.
|
214
|
+
pass
|
205
215
|
|
206
216
|
_log_task_execution_finished(output=task_info.output, logger=logger)
|
207
217
|
|
@@ -215,8 +225,7 @@ async def _run_task_rpcs(
|
|
215
225
|
task: Task, function_executor: FunctionExecutor, timeout_sec: float
|
216
226
|
) -> TaskResult:
|
217
227
|
"""Runs the task, returning the result, reporting errors via exceptions."""
|
218
|
-
|
219
|
-
response: AwaitTaskProgress
|
228
|
+
task_result: Optional[TaskResult] = None
|
220
229
|
channel: grpc.aio.Channel = function_executor.channel()
|
221
230
|
fe_stub = FunctionExecutorStub(channel)
|
222
231
|
|
@@ -231,24 +240,19 @@ async def _run_task_rpcs(
|
|
231
240
|
try:
|
232
241
|
while True:
|
233
242
|
# Wait for next response with fresh timeout each time
|
234
|
-
response = await asyncio.wait_for(
|
235
|
-
|
236
|
-
|
243
|
+
response: AwaitTaskProgress = await asyncio.wait_for(
|
244
|
+
await_rpc.read(), timeout=timeout_sec
|
245
|
+
)
|
246
|
+
|
247
|
+
if response == grpc.aio.EOF:
|
248
|
+
break
|
249
|
+
elif response.WhichOneof("response") == "task_result":
|
250
|
+
task_result = response.task_result
|
237
251
|
break
|
238
252
|
|
239
253
|
# NB: We don't actually check for other message types
|
240
254
|
# here; any message from the FE is treated as an
|
241
255
|
# indication that it's making forward progress.
|
242
|
-
|
243
|
-
if response == grpc.aio.EOF:
|
244
|
-
# Protocol error: we should get a task_result before
|
245
|
-
# we see the RPC complete.
|
246
|
-
raise grpc.aio.AioRpcError(
|
247
|
-
grpc.StatusCode.CANCELLED,
|
248
|
-
None,
|
249
|
-
None,
|
250
|
-
"Function Executor didn't return function/task alloc response",
|
251
|
-
)
|
252
256
|
finally:
|
253
257
|
# Cancel the outstanding RPC to ensure any resources in use
|
254
258
|
# are cleaned up; note that this is idempotent (in case the
|
@@ -260,7 +264,15 @@ async def _run_task_rpcs(
|
|
260
264
|
DeleteTaskRequest(task_id=task.task_id), timeout=_DELETE_TASK_TIMEOUT_SECS
|
261
265
|
)
|
262
266
|
|
263
|
-
|
267
|
+
if task_result is None:
|
268
|
+
raise grpc.aio.AioRpcError(
|
269
|
+
grpc.StatusCode.CANCELLED,
|
270
|
+
None,
|
271
|
+
None,
|
272
|
+
"Function Executor didn't return function/task alloc result",
|
273
|
+
)
|
274
|
+
|
275
|
+
return task_result
|
264
276
|
|
265
277
|
|
266
278
|
def _task_output_from_function_executor_result(
|
@@ -271,8 +283,6 @@ def _task_output_from_function_executor_result(
|
|
271
283
|
logger: Any,
|
272
284
|
) -> TaskOutput:
|
273
285
|
response_validator = MessageValidator(result)
|
274
|
-
response_validator.required_field("stdout")
|
275
|
-
response_validator.required_field("stderr")
|
276
286
|
response_validator.required_field("outcome_code")
|
277
287
|
|
278
288
|
metrics = TaskMetrics(counters={}, timers={})
|
@@ -285,7 +295,8 @@ def _task_output_from_function_executor_result(
|
|
285
295
|
result.outcome_code, logger=logger
|
286
296
|
)
|
287
297
|
failure_reason: Optional[TaskFailureReason] = None
|
288
|
-
invocation_error_output: Optional[
|
298
|
+
invocation_error_output: Optional[SerializedObjectInsideBLOB] = None
|
299
|
+
uploaded_invocation_error_blob: Optional[BLOB] = None
|
289
300
|
|
290
301
|
if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
|
291
302
|
response_validator.required_field("failure_reason")
|
@@ -294,25 +305,22 @@ def _task_output_from_function_executor_result(
|
|
294
305
|
)
|
295
306
|
if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
|
296
307
|
response_validator.required_field("invocation_error_output")
|
308
|
+
response_validator.required_field("uploaded_invocation_error_blob")
|
297
309
|
invocation_error_output = result.invocation_error_output
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
random.random() < 0.5
|
303
|
-
): # 50% chance to get stable reproduction in manual testing
|
304
|
-
outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
|
305
|
-
failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
|
310
|
+
uploaded_invocation_error_blob = result.uploaded_invocation_error_blob
|
311
|
+
elif outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
|
312
|
+
# function_outputs can have no items, this happens when the function returns None.
|
313
|
+
response_validator.required_field("uploaded_function_outputs_blob")
|
306
314
|
|
307
315
|
return TaskOutput(
|
308
316
|
allocation=allocation,
|
309
317
|
outcome_code=outcome_code,
|
310
318
|
failure_reason=failure_reason,
|
319
|
+
function_outputs=list(result.function_outputs),
|
320
|
+
uploaded_function_outputs_blob=result.uploaded_function_outputs_blob,
|
311
321
|
invocation_error_output=invocation_error_output,
|
312
|
-
|
313
|
-
next_functions=result.next_functions,
|
314
|
-
stdout=result.stdout,
|
315
|
-
stderr=result.stderr,
|
322
|
+
uploaded_invocation_error_blob=uploaded_invocation_error_blob,
|
323
|
+
next_functions=list(result.next_functions),
|
316
324
|
metrics=metrics,
|
317
325
|
execution_start_time=execution_start_time,
|
318
326
|
execution_end_time=execution_end_time,
|
@@ -332,6 +340,14 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
|
|
332
340
|
)
|
333
341
|
|
334
342
|
|
343
|
+
def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) -> None:
    """Validates the diagnostics returned by Function Executor for a task.

    Only checks that the `function_executor_log` field is present; the log
    itself is not printed yet (see the note below).
    NOTE(review): presumably `required_field` raises when the field is missing -
    confirm against MessageValidator.
    """
    diagnostics_validator = MessageValidator(task_diagnostics)
    diagnostics_validator.required_field("function_executor_log")
    # Uncomment these lines once we stop printing FE logs to stdout/stderr.
    # Print FE logs directly to Executor logs so operators can see them.
    # logger.info("Function Executor logs during task execution:")
    # print(task_diagnostics.function_executor_log)
|
349
|
+
|
350
|
+
|
335
351
|
def _to_task_outcome_code(
|
336
352
|
fe_task_outcome_code: FETaskOutcomeCode, logger
|
337
353
|
) -> TaskOutcomeCode:
|
@@ -2,10 +2,9 @@ import asyncio
|
|
2
2
|
from dataclasses import dataclass
|
3
3
|
from typing import Optional
|
4
4
|
|
5
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
6
|
-
|
7
5
|
from indexify.proto.executor_api_pb2 import TaskAllocation
|
8
6
|
|
7
|
+
from .task_input import TaskInput
|
9
8
|
from .task_output import TaskOutput
|
10
9
|
|
11
10
|
|
@@ -22,11 +21,9 @@ class TaskInfo:
|
|
22
21
|
is_cancelled: bool = False
|
23
22
|
# aio task that is currently executing a lifecycle step of this task.
|
24
23
|
aio_task: Optional[asyncio.Task] = None
|
25
|
-
#
|
26
|
-
input: Optional[
|
27
|
-
#
|
28
|
-
init_value: Optional[SerializedObject] = None
|
29
|
-
# Output of the task.
|
24
|
+
# Input if function was prepared successfully.
|
25
|
+
input: Optional[TaskInput] = None
|
26
|
+
# Output of the task, always set when the task is completed.
|
30
27
|
output: Optional[TaskOutput] = None
|
31
28
|
# True if the task is fully completed and was added to state reporter.
|
32
29
|
is_completed: bool = False
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
|
2
|
+
|
3
|
+
|
4
|
+
class TaskInput:
    """Input of a task as tracked by the function executor controller.

    Holds the inputs that are sent to Function Executor together with the
    Executor side bookkeeping of the BLOB uploads created for the task.
    """

    def __init__(
        self,
        function_inputs: FunctionInputs,
        function_outputs_blob_uri: str,
        function_outputs_blob_upload_id: str,
        invocation_error_blob_uri: str,
        invocation_error_blob_upload_id: str,
    ) -> None:
        # Actual input object sent to FE.
        self.function_inputs: FunctionInputs = function_inputs

        # Executor side function input related bookkeeping:
        # URI and upload id of the function outputs BLOB.
        self.function_outputs_blob_uri: str = function_outputs_blob_uri
        self.function_outputs_blob_upload_id: str = function_outputs_blob_upload_id
        # URI and upload id of the invocation error BLOB.
        self.invocation_error_blob_uri: str = invocation_error_blob_uri
        self.invocation_error_blob_upload_id: str = invocation_error_blob_upload_id
|