indexify-0.4.21-py3-none-any.whl → indexify-0.4.23-py3-none-any.whl
This diff shows the changes between package versions that were publicly released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/upload_task_output.py (deleted)
@@ -1,274 +0,0 @@
-import asyncio
-import hashlib
-import time
-from typing import Any, List
-
-from tensorlake.function_executor.proto.function_executor_pb2 import (
-    SerializedObject,
-    SerializedObjectEncoding,
-)
-
-from indexify.executor.blob_store.blob_store import BLOBStore
-from indexify.proto.executor_api_pb2 import (
-    DataPayload,
-    DataPayloadEncoding,
-)
-
-from .events import TaskOutputUploadFinished
-from .metrics.upload_task_output import (
-    metric_task_output_blob_store_upload_errors,
-    metric_task_output_blob_store_upload_latency,
-    metric_task_output_blob_store_uploads,
-    metric_task_output_upload_latency,
-    metric_task_output_upload_retries,
-    metric_task_output_uploads,
-    metric_tasks_uploading_outputs,
-)
-from .task_info import TaskInfo
-from .task_output import TaskOutput
-
-_TASK_OUTPUT_UPLOAD_BACKOFF_SEC = 5.0
-
-
-async def upload_task_output(
-    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
-) -> TaskOutputUploadFinished:
-    """Uploads the task output to blob store.
-
-    Doesn't raise any Exceptions. Runs till the reporting is successful.
-    """
-    logger = logger.bind(module=__name__)
-
-    with (
-        metric_tasks_uploading_outputs.track_inprogress(),
-        metric_task_output_upload_latency.time(),
-    ):
-        metric_task_output_uploads.inc()
-        await _upload_task_output_until_successful(
-            output=task_info.output,
-            blob_store=blob_store,
-            logger=logger,
-        )
-        _log_function_metrics(output=task_info.output, logger=logger)
-        return TaskOutputUploadFinished(task_info=task_info, is_success=True)
-
-
-async def _upload_task_output_until_successful(
-    output: TaskOutput, blob_store: BLOBStore, logger: Any
-) -> None:
-    upload_retries: int = 0
-
-    while True:
-        logger = logger.bind(retries=upload_retries)
-        try:
-            await _upload_task_output_once(
-                output=output, blob_store=blob_store, logger=logger
-            )
-            return
-        except Exception as e:
-            logger.error(
-                "failed to upload task output",
-                exc_info=e,
-            )
-            upload_retries += 1
-            metric_task_output_upload_retries.inc()
-            await asyncio.sleep(_TASK_OUTPUT_UPLOAD_BACKOFF_SEC)
-
-
-class _TaskOutputSummary:
-    def __init__(self):
-        self.output_count: int = 0
-        self.output_total_bytes: int = 0
-        self.next_functions_count: int = 0
-        self.stdout_count: int = 0
-        self.stdout_total_bytes: int = 0
-        self.stderr_count: int = 0
-        self.stderr_total_bytes: int = 0
-        self.invocation_error_output_count: int = 0
-        self.invocation_error_output_total_bytes: int = 0
-        self.total_bytes: int = 0
-
-
-async def _upload_task_output_once(
-    output: TaskOutput, blob_store: BLOBStore, logger: Any
-) -> None:
-    """Uploads the supplied task output to blob store.
-
-    Raises an Exception if the upload fails.
-    """
-    output_summary: _TaskOutputSummary = _task_output_summary(output)
-    logger.info(
-        "uploading task output to blob store",
-        total_bytes=output_summary.total_bytes,
-        total_files=output_summary.output_count
-        + output_summary.stdout_count
-        + output_summary.stderr_count
-        + output_summary.invocation_error_output_count,
-        output_files=output_summary.output_count,
-        output_bytes=output_summary.total_bytes,
-        next_functions_count=output_summary.next_functions_count,
-        stdout_bytes=output_summary.stdout_total_bytes,
-        stderr_bytes=output_summary.stderr_total_bytes,
-        invocation_error_output_bytes=output_summary.invocation_error_output_total_bytes,
-    )
-
-    start_time = time.time()
-    with (
-        metric_task_output_blob_store_upload_latency.time(),
-        metric_task_output_blob_store_upload_errors.count_exceptions(),
-    ):
-        metric_task_output_blob_store_uploads.inc()
-        await _upload_to_blob_store(
-            task_output=output, blob_store=blob_store, logger=logger
-        )
-
-    logger.info(
-        "files uploaded to blob store",
-        duration=time.time() - start_time,
-    )
-
-
-async def _upload_to_blob_store(
-    task_output: TaskOutput, blob_store: BLOBStore, logger: Any
-) -> None:
-    if task_output.stdout is not None:
-        stdout_url = f"{task_output.allocation.task.output_payload_uri_prefix}.{task_output.allocation.task.id}.stdout"
-        stdout_bytes: bytes = task_output.stdout.encode()
-        await blob_store.put(stdout_url, stdout_bytes, logger)
-        task_output.uploaded_stdout = DataPayload(
-            uri=stdout_url,
-            size=len(stdout_bytes),
-            sha256_hash=compute_hash(stdout_bytes),
-            encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
-            encoding_version=0,
-        )
-        # stdout is uploaded, free the memory used for it and don't upload again if we retry overall output upload again.
-        task_output.stdout = None
-
-    if task_output.stderr is not None:
-        stderr_url = f"{task_output.allocation.task.output_payload_uri_prefix}.{task_output.allocation.task.id}.stderr"
-        stderr_bytes: bytes = task_output.stderr.encode()
-        await blob_store.put(stderr_url, stderr_bytes, logger)
-        task_output.uploaded_stderr = DataPayload(
-            uri=stderr_url,
-            size=len(stderr_bytes),
-            sha256_hash=compute_hash(stderr_bytes),
-            encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
-            encoding_version=0,
-        )
-        # stderr is uploaded, free the memory used for it and don't upload again if we retry overall output upload again.
-        task_output.stderr = None
-
-    if task_output.invocation_error_output is not None:
-        invocation_error_output_url = (
-            f"{task_output.allocation.task.output_payload_uri_prefix}.inverr."
-            f"{task_output.allocation.task.graph_invocation_id}"
-        )
-        invocation_error_output_bytes: bytes = task_output.invocation_error_output.data
-        await blob_store.put(
-            invocation_error_output_url, invocation_error_output_bytes, logger
-        )
-        task_output.uploaded_invocation_error_output = DataPayload(
-            uri=invocation_error_output_url,
-            size=len(invocation_error_output_bytes),
-            sha256_hash=compute_hash(invocation_error_output_bytes),
-            encoding=_to_grpc_data_payload_encoding(
-                task_output.invocation_error_output.encoding, logger
-            ),
-            encoding_version=0,
-        )
-        # Invocation error output is uploaded, free the memory used for it and don't upload again if we retry overall output upload again.
-        task_output.invocation_error_output = None
-
-    # We can't use the default empty list output.uploaded_data_payloads because it's a singleton.
-    uploaded_data_payloads: List[DataPayload] = []
-    for output in task_output.function_outputs:
-        output: SerializedObject
-        output_ix: int = len(uploaded_data_payloads)
-        output_url: str = (
-            f"{task_output.allocation.task.output_payload_uri_prefix}.{task_output.allocation.task.id}.{output_ix}"
-        )
-        await blob_store.put(output_url, output.data, logger)
-        uploaded_data_payloads.append(
-            DataPayload(
-                uri=output_url,
-                size=len(output.data),
-                sha256_hash=compute_hash(output.data),
-                encoding=_to_grpc_data_payload_encoding(output.encoding, logger),
-                encoding_version=0,
-            )
-        )
-
-    task_output.uploaded_data_payloads = uploaded_data_payloads
-    # The output is uploaded, free the memory used for it and don't upload again if we retry overall output upload again.
-    task_output.function_outputs = []
-
-
-def _task_output_summary(task_output: TaskOutput) -> _TaskOutputSummary:
-    summary: _TaskOutputSummary = _TaskOutputSummary()
-
-    if task_output.stdout is not None:
-        summary.stdout_count += 1
-        summary.stdout_total_bytes += len(task_output.stdout)
-
-    if task_output.stderr is not None:
-        summary.stderr_count += 1
-        summary.stderr_total_bytes += len(task_output.stderr)
-
-    if task_output.invocation_error_output is not None:
-        summary.invocation_error_output_count += 1
-        summary.invocation_error_output_total_bytes += len(
-            task_output.invocation_error_output.data
-        )
-
-    for output in task_output.function_outputs:
-        output: SerializedObject
-        output_len: bytes = len(output.data)
-        summary.output_count += 1
-        summary.output_total_bytes += output_len
-
-    summary.next_functions_count = len(task_output.next_functions)
-
-    summary.total_bytes = (
-        summary.output_total_bytes
-        + summary.stdout_total_bytes
-        + summary.stderr_total_bytes
-    )
-    return summary
-
-
-def _to_grpc_data_payload_encoding(
-    encoding: SerializedObjectEncoding, logger: Any
-) -> DataPayloadEncoding:
-    if encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_BINARY_PICKLE:
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
-    elif encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_UTF8_JSON:
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
-    elif encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_UTF8_TEXT:
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT
-    else:
-        logger.error(
-            "Unexpected encoding for SerializedObject",
-            encoding=SerializedObjectEncoding.Name(encoding),
-        )
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UNKNOWN
-
-
-def compute_hash(data: bytes) -> str:
-    hasher = hashlib.sha256(usedforsecurity=False)
-    hasher.update(data)
-    return hasher.hexdigest()
-
-
-# Temporary workaround is logging customer metrics until we store them somewhere
-# for future retrieval and processing.
-def _log_function_metrics(output: TaskOutput, logger: Any):
-    if output.metrics is None:
-        return
-
-    for counter_name, counter_value in output.metrics.counters.items():
-        logger.info(
-            "function_metric", counter_name=counter_name, counter_value=counter_value
-        )
-    for timer_name, timer_value in output.metrics.timers.items():
-        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)