indexify-0.3.31-py3-none-any.whl → indexify-0.4.3-py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, exactly as they appear in the public registry, and is provided for informational purposes only.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +158 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +69 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +68 -60
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +129 -108
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
- indexify-0.4.3.dist-info/RECORD +68 -0
- indexify-0.4.3.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
indexify/executor/runtime_probes.py (deleted)

```diff
@@ -1,68 +0,0 @@
-import os
-import platform
-import sys
-from typing import Any, Dict, Tuple
-
-from pydantic import BaseModel
-
-DEFAULT_EXECUTOR = "tensorlake/indexify-executor-default"
-# Empty string is used as a default hash which tells the scheduler to accept any hash.
-DEFAULT_HASH = ""
-
-
-class ProbeInfo(BaseModel):
-    image_name: str
-    image_hash: str
-    python_major_version: int
-    labels: Dict[str, Any] = {}
-    is_default_executor: bool
-
-
-class RuntimeProbes:
-    def __init__(self) -> None:
-        self._image_name = self._read_image_name()
-        self._image_hash = self._read_image_hash()
-        self._os_name = platform.system()
-        self._architecture = platform.machine()
-        (
-            self._python_version_major,
-            self._python_version_minor,
-        ) = self._get_python_version()
-
-    def _read_image_name(self) -> str:
-        file_path = os.path.expanduser("~/.indexify/image_name")
-        if os.path.exists(file_path):
-            with open(file_path, "r") as file:
-                return file.read().strip()
-        return DEFAULT_EXECUTOR
-
-    def _read_image_hash(self) -> str:
-        file_path = os.path.expanduser("~/.indexify/image_hash")
-        if os.path.exists(file_path):
-            with open(file_path, "r") as file:
-                return file.read().strip()
-        return DEFAULT_HASH
-
-    def _get_python_version(self) -> Tuple[int, int]:
-        version_info = sys.version_info
-        return version_info.major, version_info.minor
-
-    def _is_default_executor(self):
-        return True if self._read_image_name() == DEFAULT_EXECUTOR else False
-
-    def probe(self) -> ProbeInfo:
-        labels = {
-            "os": self._os_name,
-            "image_name": self._image_name,
-            "architecture": self._architecture,
-            "python_major_version": self._python_version_major,
-            "python_minor_version": self._python_version_minor,
-        }
-
-        return ProbeInfo(
-            image_name=self._image_name,
-            image_hash=self._image_hash,
-            python_major_version=self._python_version_major,
-            labels=labels,
-            is_default_executor=self._is_default_executor(),
-        )
```
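For context, the removed probe module was self-contained, so it could be exercised directly against 0.3.31. A minimal sketch (the import path and file locations come from the code above; the call site itself is illustrative and not part of the package):

```python
# Works only against indexify 0.3.31, where this module still exists.
from indexify.executor.runtime_probes import RuntimeProbes

# Image name/hash are read from ~/.indexify/image_name and ~/.indexify/image_hash,
# falling back to the defaults defined above; labels carry OS, architecture and
# Python version so the scheduler can match tasks to executors.
probe_info = RuntimeProbes().probe()
print(probe_info.image_name, probe_info.is_default_executor, probe_info.labels)
```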
indexify/executor/task_fetcher.py (deleted)

```diff
@@ -1,96 +0,0 @@
-import json
-import time
-from socket import gethostname
-from typing import AsyncGenerator, Dict, List, Optional
-
-import structlog
-from httpx_sse import aconnect_sse
-from tensorlake.utils.http_client import get_httpx_client
-
-from .api_objects import ExecutorMetadata, FunctionURI, Task
-from .metrics.task_fetcher import (
-    metric_server_registration_errors,
-    metric_server_registration_latency,
-    metric_server_registrations,
-)
-from .runtime_probes import ProbeInfo, RuntimeProbes
-
-
-class TaskFetcher:
-    """Registers with Indexify server and fetches tasks from it."""
-
-    def __init__(
-        self,
-        executor_id: str,
-        executor_version: str,
-        labels: Dict[str, str],
-        function_allowlist: Optional[List[FunctionURI]],
-        protocol: str,
-        indexify_server_addr: str,
-        config_path: Optional[str] = None,
-    ):
-        self._protocol: str = protocol
-        self._indexify_server_addr: str = indexify_server_addr
-        self.config_path = config_path
-        self._logger = structlog.get_logger(module=__name__)
-
-        probe_info: ProbeInfo = RuntimeProbes().probe()
-        all_labels = probe_info.labels.copy()
-        all_labels.update(labels)
-
-        self._executor_metadata: ExecutorMetadata = ExecutorMetadata(
-            id=executor_id,
-            executor_version=executor_version,
-            addr=gethostname(),
-            function_allowlist=function_allowlist,
-            labels=all_labels,
-        )
-
-    async def run(self) -> AsyncGenerator[Task, None]:
-        """Fetches tasks that Indexify server assigned to the Executor.
-
-        Raises an exception if error occurred."""
-        url = f"{self._protocol}://{self._indexify_server_addr}/internal/executors/{self._executor_metadata.id}/tasks"
-
-        self._logger.info(
-            "registering_executor",
-            executor_id=self._executor_metadata.id,
-            url=url,
-            executor_version=self._executor_metadata.executor_version,
-        )
-        metric_server_registrations.inc()
-        registration_start_time: float = time.monotonic()
-
-        async with get_httpx_client(
-            config_path=self.config_path, make_async=True
-        ) as client:
-            async with aconnect_sse(
-                client,
-                "POST",
-                url,
-                json=self._executor_metadata.model_dump(),
-                headers={"Content-Type": "application/json"},
-            ) as event_source:
-                try:
-                    event_source.response.raise_for_status()
-                except Exception as e:
-                    metric_server_registration_errors.inc()
-                    await event_source.response.aread()
-                    raise Exception(
-                        "failed to register at server. "
-                        f"Response code: {event_source.response.status_code}. "
-                        f"Response text: '{event_source.response.text}'."
-                    ) from e
-                finally:
-                    metric_server_registration_latency.observe(
-                        time.monotonic() - registration_start_time
-                    )
-
-                self._logger.info(
-                    "executor_registered", executor_id=self._executor_metadata.id
-                )
-
-                async for sse in event_source.aiter_sse():
-                    task_dicts = json.loads(sse.data)
-                    for task_dict in task_dicts:
-                        yield Task.model_validate(task_dict, strict=False)
```
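The removed fetcher exposed a single async generator. A minimal usage sketch against 0.3.31, with placeholder values for the constructor arguments (the signature is taken from the code above; the values are not package defaults):

```python
import asyncio

# Works only against indexify 0.3.31, where this module still exists.
from indexify.executor.task_fetcher import TaskFetcher


async def poll_tasks() -> None:
    fetcher = TaskFetcher(
        executor_id="executor-1",  # placeholder id
        executor_version="0.3.31",
        labels={},
        function_allowlist=None,
        protocol="http",
        indexify_server_addr="localhost:8900",  # placeholder address
    )
    # run() registers the executor over SSE, then yields Task objects as the
    # server assigns them; it raises if registration fails.
    async for task in fetcher.run():
        print(task)


asyncio.run(poll_tasks())
```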
indexify/executor/task_reporter.py (deleted)

```diff
@@ -1,459 +0,0 @@
-import asyncio
-import hashlib
-import time
-from typing import Any, List, Optional, Tuple
-
-import nanoid
-from httpx import Timeout
-from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
-from tensorlake.utils.http_client import get_httpx_client
-
-from indexify.proto.executor_api_pb2 import DataPayload as DataPayloadProto
-from indexify.proto.executor_api_pb2 import (
-    DataPayloadEncoding,
-    OutputEncoding,
-    ReportTaskOutcomeRequest,
-    TaskOutcome,
-)
-from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
-
-from .api_objects import (
-    TASK_OUTCOME_FAILURE,
-    TASK_OUTCOME_SUCCESS,
-    DataPayload,
-    IngestFnOutputsResponse,
-    RouterOutput,
-    TaskResult,
-)
-from .blob_store.blob_store import BLOBStore
-from .function_executor.task_output import TaskOutput
-from .grpc.channel_manager import ChannelManager
-from .metrics.task_reporter import (
-    metric_report_task_outcome_errors,
-    metric_report_task_outcome_latency,
-    metric_report_task_outcome_rpcs,
-    metric_server_ingest_files_errors,
-    metric_server_ingest_files_latency,
-    metric_server_ingest_files_requests,
-    metric_task_output_blob_store_upload_errors,
-    metric_task_output_blob_store_upload_latency,
-    metric_task_output_blob_store_uploads,
-)
-
-
-# https://github.com/psf/requests/issues/1081#issuecomment-428504128
-class ForceMultipartDict(dict):
-    def __bool__(self):
-        return True
-
-
-FORCE_MULTIPART = ForceMultipartDict()
-UTF_8_CONTENT_TYPE = "application/octet-stream"
-
-
-class TaskOutputSummary:
-    def __init__(self):
-        self.output_count: int = 0
-        self.output_total_bytes: int = 0
-        self.router_output_count: int = 0
-        self.stdout_count: int = 0
-        self.stdout_total_bytes: int = 0
-        self.stderr_count: int = 0
-        self.stderr_total_bytes: int = 0
-        self.total_bytes: int = 0
-
-
-class TaskReporter:
-    def __init__(
-        self,
-        base_url: str,
-        executor_id: str,
-        channel_manager: ChannelManager,
-        blob_store: BLOBStore,
-        config_path: Optional[str] = None,
-    ):
-        self._base_url = base_url
-        self._executor_id = executor_id
-        self._is_shutdown = False
-        # Use thread-safe sync client due to issues with async client.
-        # Async client attempts to use connections it already closed.
-        # See e.g. https://github.com/encode/httpx/issues/2337.
-        # Creating a new async client for each request fixes this but it
-        # results in not reusing established TCP connections to server.
-        self._client = get_httpx_client(config_path, make_async=False)
-        self._channel_manager = channel_manager
-        self._blob_store = blob_store
-
-    async def shutdown(self) -> None:
-        """Shuts down the task reporter.
-
-        Task reporter stops reporting all task outcomes to the Server.
-        There are many task failures due to Executor shutdown. We give wrong
-        signals to Server if we report such failures.
-        """
-        self._is_shutdown = True
-
-    async def report(self, output: TaskOutput, logger: Any) -> None:
-        """Reports result of the supplied task."""
-        logger = logger.bind(module=__name__)
-
-        if self._is_shutdown:
-            logger.warning(
-                "task reporter got shutdown, skipping task outcome reporting"
-            )
-            return
-
-        # TODO: If the files are uploaded successfully,
-        # we should record that so that if we fail to report
-        # the task outcome, we don't retry the upload.
-        # This will save us some time and resources.
-        # It's good to do this once we delete all the legacy code paths.
-
-        output_summary: TaskOutputSummary = _task_output_summary(output)
-        logger.info(
-            "reporting task outcome",
-            total_bytes=output_summary.total_bytes,
-            total_files=output_summary.output_count
-            + output_summary.stdout_count
-            + output_summary.stderr_count,
-            output_files=output_summary.output_count,
-            output_bytes=output_summary.total_bytes,
-            router_output_count=output_summary.router_output_count,
-            stdout_bytes=output_summary.stdout_total_bytes,
-            stderr_bytes=output_summary.stderr_total_bytes,
-        )
-
-        if output.output_payload_uri_prefix is None:
-            ingested_files = await self._ingest_files_at_server(output, logger)
-        else:
-            ingested_files = await self._ingest_files_at_blob_store(output, logger)
-
-        fn_outputs = []
-        for data_payload in ingested_files.data_payloads:
-            fn_outputs.append(
-                DataPayloadProto(
-                    path=data_payload.path, # TODO: stop using this deprecated field once Server side migration is done.
-                    uri=data_payload.path,
-                    size=data_payload.size,
-                    sha256_hash=data_payload.sha256_hash,
-                    encoding=_to_grpc_data_payload_encoding(output),
-                    encoding_version=0,
-                )
-            )
-        stdout, stderr = None, None
-        if ingested_files.stdout is not None:
-            stdout = DataPayloadProto(
-                path=ingested_files.stdout.path, # TODO: stop using this deprecated field once Server side migration is done.
-                uri=ingested_files.stdout.path,
-                size=ingested_files.stdout.size,
-                sha256_hash=ingested_files.stdout.sha256_hash,
-                encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
-                encoding_version=0,
-            )
-        if ingested_files.stderr is not None:
-            stderr = DataPayloadProto(
-                path=ingested_files.stderr.path, # TODO: stop using this deprecated field once Server side migration is done.
-                uri=ingested_files.stderr.path,
-                size=ingested_files.stderr.size,
-                sha256_hash=ingested_files.stderr.sha256_hash,
-                encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
-                encoding_version=0,
-            )
-
-        request = ReportTaskOutcomeRequest(
-            task_id=output.task_id,
-            namespace=output.namespace,
-            graph_name=output.graph_name,
-            function_name=output.function_name,
-            graph_invocation_id=output.graph_invocation_id,
-            outcome=_to_grpc_task_outcome(output),
-            invocation_id=output.graph_invocation_id,
-            executor_id=self._executor_id,
-            reducer=output.reducer,
-            next_functions=(output.router_output.edges if output.router_output else []),
-            fn_outputs=fn_outputs,
-            stdout=stdout,
-            stderr=stderr,
-            output_encoding=_to_grpc_output_encoding(output),
-            output_encoding_version=0,
-        )
-        try:
-            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
-            with (
-                metric_report_task_outcome_latency.time(),
-                metric_report_task_outcome_errors.count_exceptions(),
-            ):
-                metric_report_task_outcome_rpcs.inc()
-                await stub.report_task_outcome(request, timeout=5.0)
-        except Exception as e:
-            logger.error("failed to report task outcome", error=e)
-            raise e
-
-    async def _ingest_files_at_server(
-        self, output: TaskOutput, logger: Any
-    ) -> IngestFnOutputsResponse:
-        logger.warning("uploading task output files to server (deprecated mode)")
-
-        task_result, output_files = self._process_task_output(output)
-        task_result_data = task_result.model_dump_json(exclude_none=True)
-
-        kwargs = {
-            "data": {"task_result": task_result_data},
-            # Use httpx default timeout of 5s for all timeout types.
-            # For read timeouts, use 5 minutes to allow for large file uploads.
-            "timeout": Timeout(
-                5.0,
-                read=5.0 * 60,
-            ),
-            "files": output_files if len(output_files) > 0 else FORCE_MULTIPART,
-        }
-
-        start_time = time.time()
-        with metric_server_ingest_files_latency.time():
-            metric_server_ingest_files_requests.inc()
-            # Run in a separate thread to not block the main event loop.
-            response = await asyncio.to_thread(
-                self._client.post,
-                url=f"{self._base_url}/internal/ingest_fn_outputs",
-                **kwargs,
-            )
-        end_time = time.time()
-        logger.info(
-            "files uploaded to server",
-            response_time=end_time - start_time,
-            response_code=response.status_code,
-        )
-
-        try:
-            response.raise_for_status()
-        except Exception as e:
-            metric_server_ingest_files_errors.inc()
-            # Caller catches and logs the exception.
-            raise Exception(
-                "failed to upload files. "
-                f"Response code: {response.status_code}. "
-                f"Response text: '{response.text}'."
-            ) from e
-
-        ingested_files_response = response.json()
-        return IngestFnOutputsResponse.model_validate(ingested_files_response)
-
-    async def _ingest_files_at_blob_store(
-        self, output: TaskOutput, logger: Any
-    ) -> IngestFnOutputsResponse:
-        start_time = time.time()
-        with (
-            metric_task_output_blob_store_upload_latency.time(),
-            metric_task_output_blob_store_upload_errors.count_exceptions(),
-        ):
-            metric_task_output_blob_store_uploads.inc()
-            response = await self._upload_output_to_blob_store(output, logger)
-
-        logger.info(
-            "files uploaded to blob store",
-            duration=time.time() - start_time,
-        )
-        return response
-
-    async def _upload_output_to_blob_store(
-        self, output: TaskOutput, logger: Any
-    ) -> IngestFnOutputsResponse:
-        data_payloads: List[DataPayload] = []
-        stdout: Optional[DataPayload] = None
-        stderr: Optional[DataPayload] = None
-
-        if output.stdout is not None:
-            stdout_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stdout"
-            stdout_bytes: bytes = output.stdout.encode()
-            await self._blob_store.put(stdout_url, stdout_bytes, logger)
-            stdout = DataPayload(
-                path=stdout_url,
-                size=len(stdout_bytes),
-                sha256_hash=_compute_hash(stdout_bytes),
-            )
-
-        if output.stderr is not None:
-            stderr_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stderr"
-            stderr_bytes: bytes = output.stderr.encode()
-            await self._blob_store.put(stderr_url, stderr_bytes, logger)
-            stderr = DataPayload(
-                path=stderr_url,
-                size=len(stderr_bytes),
-                sha256_hash=_compute_hash(stderr_bytes),
-            )
-
-        if output.function_output is not None:
-            for func_output_item in output.function_output.outputs:
-                node_output_sequence = len(data_payloads)
-                if output.reducer:
-                    # Reducer tasks have to write their results into the same blob.
-                    output_url = (
-                        f"{output.output_payload_uri_prefix}.{node_output_sequence}"
-                    )
-                else:
-                    # Regular tasks write their results into different blobs made unique using task ids.
-                    output_url = f"{output.output_payload_uri_prefix}.{output.task_id}.{node_output_sequence}"
-
-                output_bytes: bytes = (
-                    func_output_item.bytes
-                    if func_output_item.HasField("bytes")
-                    else func_output_item.string.encode()
-                )
-                await self._blob_store.put(output_url, output_bytes, logger)
-                data_payloads.append(
-                    DataPayload(
-                        path=output_url,
-                        size=len(output_bytes),
-                        sha256_hash=_compute_hash(output_bytes),
-                    )
-                )
-
-        return IngestFnOutputsResponse(
-            data_payloads=data_payloads,
-            stdout=stdout,
-            stderr=stderr,
-        )
-
-    def _process_task_output(self, output: TaskOutput) -> Tuple[TaskResult, List[Any]]:
-        task_result = TaskResult(
-            outcome="failure",
-            namespace=output.namespace,
-            compute_graph=output.graph_name,
-            compute_fn=output.function_name,
-            invocation_id=output.graph_invocation_id,
-            executor_id=self._executor_id,
-            task_id=output.task_id,
-            reducer=output.reducer,
-        )
-        output_files: List[Any] = []
-        task_result.outcome = (
-            TASK_OUTCOME_SUCCESS if output.success else TASK_OUTCOME_FAILURE
-        )
-
-        _process_function_output(
-            function_output=output.function_output, output_files=output_files
-        )
-        _process_router_output(
-            router_output=output.router_output, task_result=task_result
-        )
-        _process_stdout(stdout=output.stdout, output_files=output_files)
-        _process_stderr(stderr=output.stderr, output_files=output_files)
-
-        return task_result, output_files
-
-
-def _process_function_output(
-    function_output: Optional[FunctionOutput], output_files: List[Any]
-) -> None:
-    if function_output is None:
-        return
-
-    for output in function_output.outputs or []:
-        payload = output.bytes if output.HasField("bytes") else output.string
-        output_files.append(
-            (
-                "node_outputs",
-                (nanoid.generate(), payload, output.content_type),
-            )
-        )
-
-
-def _process_router_output(
-    router_output: Optional[RouterOutput],
-    task_result: TaskResult,
-) -> None:
-    if router_output is None:
-        return
-
-    task_result.router_output = RouterOutput(edges=router_output.edges)
-
-
-def _process_stdout(stdout: Optional[str], output_files: List[Any]) -> None:
-    if stdout is None:
-        return
-
-    output_files.append(
-        (
-            "stdout",
-            (
-                nanoid.generate(),
-                stdout.encode(),
-                UTF_8_CONTENT_TYPE,
-            ),
-        )
-    )
-
-
-def _process_stderr(stderr: Optional[str], output_files: List[Any]) -> None:
-    if stderr is None:
-        return
-
-    output_files.append(
-        (
-            "stderr",
-            (
-                nanoid.generate(),
-                stderr.encode(),
-                UTF_8_CONTENT_TYPE,
-            ),
-        )
-    )
-
-
-def _task_output_summary(output: TaskOutput) -> TaskOutputSummary:
-    summary: TaskOutputSummary = TaskOutputSummary()
-
-    if output.stdout is not None:
-        summary.stdout_count += 1
-        summary.stdout_total_bytes += len(output.stdout)
-
-    if output.stderr is not None:
-        summary.stderr_count += 1
-        summary.stderr_total_bytes += len(output.stderr)
-
-    if output.function_output is not None:
-        for func_output_item in output.function_output.outputs:
-            output_len: bytes = len(
-                func_output_item.bytes
-                if func_output_item.HasField("bytes")
-                else func_output_item.string
-            )
-            summary.output_count += 1
-            summary.output_total_bytes += output_len
-
-    if output.router_output is not None:
-        summary.router_output_count += 1
-
-    summary.total_bytes = (
-        summary.output_total_bytes
-        + summary.stdout_total_bytes
-        + summary.stderr_total_bytes
-    )
-    return summary
-
-
-def _to_grpc_task_outcome(task_output: TaskOutput) -> TaskOutcome:
-    if task_output.success:
-        return TaskOutcome.TASK_OUTCOME_SUCCESS
-    else:
-        return TaskOutcome.TASK_OUTCOME_FAILURE
-
-
-def _to_grpc_output_encoding(task_output: TaskOutput) -> OutputEncoding:
-    if task_output.output_encoding == "json":
-        return OutputEncoding.OUTPUT_ENCODING_JSON
-    else:
-        return OutputEncoding.OUTPUT_ENCODING_PICKLE
-
-
-def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncoding:
-    if task_output.output_encoding == "json":
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
-    else:
-        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
-
-
-def _compute_hash(data: bytes) -> str:
-    hasher = hashlib.sha256(usedforsecurity=False)
-    hasher.update(data)
-    return hasher.hexdigest()
```
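One detail worth calling out in the removed reporter is the blob naming scheme used when outputs are uploaded directly to blob storage: reducer tasks reuse the same blob per output index, while regular tasks get blobs scoped by task id. A small standalone sketch of that rule (the prefix and task id below are placeholder values, not package defaults):

```python
# Mirrors the URL construction in the removed _upload_output_to_blob_store().
prefix = "s3://bucket/invocations/inv-1/fn"  # placeholder output_payload_uri_prefix
task_id = "task-42"                          # placeholder task id


def output_url(reducer: bool, node_output_sequence: int) -> str:
    if reducer:
        # Reducer tasks write their results into the same blob across tasks.
        return f"{prefix}.{node_output_sequence}"
    # Regular tasks write into blobs made unique by the task id.
    return f"{prefix}.{task_id}.{node_output_sequence}"


print(output_url(reducer=False, node_output_sequence=0))  # s3://bucket/invocations/inv-1/fn.task-42.0
print(output_url(reducer=True, node_output_sequence=0))   # s3://bucket/invocations/inv-1/fn.0
```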