indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,20 @@
|
|
1
|
+
# Prometheus metrics reported while finalizing tasks.
import prometheus_client

from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation

# Task finalization metrics.
metric_task_finalizations: prometheus_client.Counter = prometheus_client.Counter(
    "task_finalizations", "Number of task finalizations"
)
metric_task_finalization_errors: prometheus_client.Counter = prometheus_client.Counter(
    "task_finalization_errors", "Number of task finalization errors"
)
metric_tasks_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
    "tasks_finalizing", "Number of tasks currently finalizing"
)
metric_task_finalization_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation("task_finalization", "task finalization")
)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Prometheus metrics reported while preparing tasks for execution.
import prometheus_client

from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation

metric_task_preparations: prometheus_client.Counter = prometheus_client.Counter(
    "task_preparations",
    "Number of task preparations for execution",
)
metric_task_preparation_errors: prometheus_client.Counter = prometheus_client.Counter(
    "task_preparation_errors",
    "Number of task preparation errors",
)
metric_task_preparation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "task_preparation",
        "task preparation for execution",
    )
)
metric_tasks_getting_prepared: prometheus_client.Gauge = prometheus_client.Gauge(
    "tasks_getting_prepared",
    "Number of tasks currently getting prepared for execution",
)
|
@@ -6,23 +6,24 @@ from indexify.executor.monitoring.metrics import (
|
|
6
6
|
|
7
7
|
# Metrics for the full run-task RPC lifecycle against a Function Executor.
metric_function_executor_run_task_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "function_executor_run_task_rpcs",
    "Number of Function Executor run task lifecycle RPC sequences",
)
metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
    "function_executor_run_task_rpc_errors",
    "Number of Function Executor run task lifecycle RPC errors",
)
metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = latency_metric_for_customer_controlled_operation(
    "function_executor_run_task_rpc", "Function Executor run task lifecycle RPC"
)
metric_function_executor_run_task_rpcs_in_progress: prometheus_client.Gauge = prometheus_client.Gauge(
    "function_executor_run_task_rpcs_in_progress",
    "Number of Function Executor run task lifecycle RPCs in progress",
)
|
@@ -1,38 +1,256 @@
|
|
1
|
-
|
1
|
+
import asyncio
|
2
|
+
import time
|
3
|
+
from typing import Any, List, Optional
|
4
|
+
|
5
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
+
BLOB,
|
7
|
+
BLOBChunk,
|
8
|
+
FunctionInputs,
|
9
|
+
SerializedObjectInsideBLOB,
|
10
|
+
)
|
2
11
|
|
3
12
|
from indexify.executor.blob_store.blob_store import BLOBStore
|
13
|
+
from indexify.proto.executor_api_pb2 import DataPayload, Task
|
4
14
|
|
5
|
-
from .downloads import
|
15
|
+
from .downloads import serialized_object_manifest_from_data_payload_proto
|
6
16
|
from .events import TaskPreparationFinished
|
17
|
+
from .metrics.prepare_task import (
|
18
|
+
metric_task_preparation_errors,
|
19
|
+
metric_task_preparation_latency,
|
20
|
+
metric_task_preparations,
|
21
|
+
metric_tasks_getting_prepared,
|
22
|
+
)
|
7
23
|
from .task_info import TaskInfo
|
24
|
+
from .task_input import TaskInput
|
25
|
+
|
26
|
+
# The following constants are subject to S3 limits,
# see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
#
# 7 days - max presigned URL validity duration and limit on max function duration
_MAX_PRESIGNED_URI_EXPIRATION_SEC: int = 7 * 24 * 60 * 60
# This chunk size gives the best performance with S3. Based on our benchmarking.
_BLOB_OPTIMAL_CHUNK_SIZE_BYTES: int = 100 * 1024 * 1024  # 100 MB
# Max output size with optimal chunks is 100 * 100 MB = 10 GB.
# Each chunk requires a separate S3 presign operation, so we limit the number of optimal chunks to 100.
# S3 presign operations are local, it typically takes 30 ms per 100 URLs.
_OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT: int = 100
# This chunk size gives ~20% slower performance with S3 compared to optimal. Based on our benchmarking.
_OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES: int = 1 * 1024 * 1024 * 1024  # 1 GB
# Max output size with slower chunks is 100 * 1 GB = 100 GB.
_OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
# Invocation error output is using a single chunk.
_INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024 * 1024  # 10 MB
|
8
43
|
|
9
44
|
|
10
45
|
async def prepare_task(
    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
) -> TaskPreparationFinished:
    """Prepares the task for execution and reports the outcome as an event.

    If successful then the task is runnable (its input is stored in
    task_info.input).
    Doesn't raise any exceptions.
    """
    logger = logger.bind(module=__name__)
    start_time = time.monotonic()
    try:
        # The metric context managers record errors, in-progress count and
        # latency for the whole preparation attempt.
        with (
            metric_task_preparation_errors.count_exceptions(),
            metric_tasks_getting_prepared.track_inprogress(),
            metric_task_preparation_latency.time(),
        ):
            metric_task_preparations.inc()
            task_info.input = await _prepare_task_input(
                task_info=task_info,
                blob_store=blob_store,
                logger=logger,
            )
            logger.info(
                "Task was prepared for execution",
                duration=time.monotonic() - start_time,
            )
            return TaskPreparationFinished(
                task_info=task_info,
                is_success=True,
            )
    except asyncio.CancelledError:
        # Cancellation is reported as an unsuccessful preparation without
        # error logging.
        return TaskPreparationFinished(task_info=task_info, is_success=False)
    except BaseException as e:
        # Catch BaseException so no exception escapes, per the docstring
        # contract; the failure is logged and reported via the event.
        logger.error(
            "Failed to prepare task for execution",
            exc_info=e,
            duration=time.monotonic() - start_time,
        )
        return TaskPreparationFinished(task_info=task_info, is_success=False)
|
84
|
+
|
85
|
+
|
86
|
+
async def _prepare_task_input(
    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
) -> TaskInput:
    """Builds the TaskInput with presigned input and output BLOBs for the task.

    Creates multipart uploads for the function outputs and invocation error
    BLOBs; they are completed (or aborted) when finalizing the task. If
    anything fails after a multipart upload was created, that upload is
    aborted here so it doesn't leak in the BLOB store.

    Raises an exception on error.
    """
    task: Task = task_info.allocation.task
    function_init_value_blob: Optional[BLOB] = None
    function_init_value: Optional[SerializedObjectInsideBLOB] = None
    if task.HasField("reducer_input"):
        function_init_value_blob = await _presign_function_input_blob(
            data_payload=task.reducer_input,
            blob_store=blob_store,
            logger=logger,
        )
        function_init_value = _to_serialized_object_inside_blob(task.reducer_input)

    function_outputs_blob_uri: str = (
        f"{task.output_payload_uri_prefix}.{task_info.allocation.allocation_id}.output"
    )
    invocation_error_blob_uri: str = (
        f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
    )

    # The uploads are completed when finalizing the task.
    function_outputs_blob_upload_id: Optional[str] = None
    invocation_error_blob_upload_id: Optional[str] = None

    try:
        function_outputs_blob_upload_id = await blob_store.create_multipart_upload(
            uri=function_outputs_blob_uri,
            logger=logger,
        )
        invocation_error_blob_upload_id = await blob_store.create_multipart_upload(
            uri=invocation_error_blob_uri,
            logger=logger,
        )
        # The presign calls below run inside the try block so that a failure
        # here also aborts the multipart uploads created above. Previously only
        # the create calls were guarded, leaking both uploads if a later
        # presign call raised.
        return TaskInput(
            function_inputs=FunctionInputs(
                function_input_blob=await _presign_function_input_blob(
                    data_payload=task.input,
                    blob_store=blob_store,
                    logger=logger,
                ),
                function_input=_to_serialized_object_inside_blob(task.input),
                function_init_value_blob=function_init_value_blob,
                function_init_value=function_init_value,
                function_outputs_blob=await _presign_function_outputs_blob(
                    uri=function_outputs_blob_uri,
                    upload_id=function_outputs_blob_upload_id,
                    blob_store=blob_store,
                    logger=logger,
                ),
                invocation_error_blob=await _presign_invocation_error_blob(
                    uri=invocation_error_blob_uri,
                    upload_id=invocation_error_blob_upload_id,
                    blob_store=blob_store,
                    logger=logger,
                ),
            ),
            function_outputs_blob_uri=function_outputs_blob_uri,
            function_outputs_blob_upload_id=function_outputs_blob_upload_id,
            invocation_error_blob_uri=invocation_error_blob_uri,
            invocation_error_blob_upload_id=invocation_error_blob_upload_id,
        )
    except BaseException:
        # Abort any multipart upload that was already created before
        # re-raising, so nothing leaks in the BLOB store.
        if function_outputs_blob_upload_id is not None:
            await blob_store.abort_multipart_upload(
                uri=function_outputs_blob_uri,
                upload_id=function_outputs_blob_upload_id,
                logger=logger,
            )
        if invocation_error_blob_upload_id is not None:
            await blob_store.abort_multipart_upload(
                uri=invocation_error_blob_uri,
                upload_id=invocation_error_blob_upload_id,
                logger=logger,
            )
        raise
|
167
|
+
|
168
|
+
|
169
|
+
async def _presign_function_input_blob(
    data_payload: DataPayload, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Returns a read-only BLOB for the data payload, split into fixed-size chunks."""
    read_uri: str = await blob_store.presign_get_uri(
        uri=data_payload.uri,
        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
        logger=logger,
    )
    # Enough fixed-size chunks to cover the whole payload (ceiling division).
    chunk_count: int = -(-data_payload.size // _BLOB_OPTIMAL_CHUNK_SIZE_BYTES)
    chunks: List[BLOBChunk] = [
        BLOBChunk(
            # The presigned URI allows reading any byte range in the BLOB.
            uri=read_uri,
            size=_BLOB_OPTIMAL_CHUNK_SIZE_BYTES,
            # ETag is only set by FE when returning BLOBs to us.
        )
        for _ in range(chunk_count)
    ]
    return BLOB(chunks=chunks)
|
191
|
+
|
192
|
+
|
193
|
+
async def _presign_function_outputs_blob(
    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Presigns the output blob for the task."""
    total_chunk_count: int = (
        _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT + _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT
    )
    chunks: List[BLOBChunk] = []
    for chunk_index in range(total_chunk_count):
        # Multipart upload part numbers are 1-based.
        part_uri: str = await blob_store.presign_upload_part_uri(
            uri=uri,
            part_number=chunk_index + 1,
            upload_id=upload_id,
            expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
            logger=logger,
        )
        # Leading chunks use the optimal size; the remainder fall back to the
        # larger, slower size to extend the max supported output size.
        if chunk_index < _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT:
            chunk_size: int = _BLOB_OPTIMAL_CHUNK_SIZE_BYTES
        else:
            chunk_size = _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES
        chunks.append(
            BLOBChunk(
                uri=part_uri,
                size=chunk_size,
                # ETag is only set by FE when returning BLOBs to us.
            )
        )
    return BLOB(chunks=chunks)
|
226
|
+
|
227
|
+
|
228
|
+
async def _presign_invocation_error_blob(
    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
) -> BLOB:
    """Presigns the output blob for the invocation error."""
    # The invocation error output always fits into a single chunk.
    part_uri: str = await blob_store.presign_upload_part_uri(
        uri=uri,
        part_number=1,
        upload_id=upload_id,
        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
        logger=logger,
    )
    single_chunk: BLOBChunk = BLOBChunk(
        uri=part_uri,
        size=_INVOCATION_ERROR_MAX_SIZE_BYTES,
        # ETag is only set by FE when returning BLOBs to us.
    )
    return BLOB(chunks=[single_chunk])
|
248
|
+
|
249
|
+
|
250
|
+
def _to_serialized_object_inside_blob(
    data_payload: DataPayload,
) -> SerializedObjectInsideBLOB:
    """Converts a DataPayload proto into its SerializedObjectInsideBLOB form."""
    manifest = serialized_object_manifest_from_data_payload_proto(data_payload)
    return SerializedObjectInsideBLOB(
        manifest=manifest,
        offset=data_payload.offset,
    )