indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/cli/executor.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from
|
1
|
+
from indexify.executor.logging import (
|
2
2
|
configure_development_mode_logging,
|
3
3
|
configure_logging_early,
|
4
4
|
configure_production_mode_logging,
|
@@ -162,13 +162,6 @@ def executor(
|
|
162
162
|
shutil.rmtree(str(executor_cache_path))
|
163
163
|
executor_cache_path.mkdir(parents=True, exist_ok=True)
|
164
164
|
|
165
|
-
blob_store: BLOBStore = BLOBStore(
|
166
|
-
# Local FS mode is used in tests and in cases when user wants to store data on NFS.
|
167
|
-
local=LocalFSBLOBStore(),
|
168
|
-
# S3 is initialized lazily so it's okay to create it even if the user is not going to use it.
|
169
|
-
s3=S3BLOBStore(),
|
170
|
-
)
|
171
|
-
|
172
165
|
host_resources_provider: HostResourcesProvider = HostResourcesProvider(
|
173
166
|
gpu_allocator=NvidiaGPUAllocator(logger),
|
174
167
|
# Assuming a simple setup in OSS where Executor container has a single file system
|
@@ -200,6 +193,6 @@ def executor(
|
|
200
193
|
config_path=config_path,
|
201
194
|
monitoring_server_host=monitoring_server_host,
|
202
195
|
monitoring_server_port=monitoring_server_port,
|
203
|
-
blob_store=
|
196
|
+
blob_store=BLOBStore(),
|
204
197
|
host_resources_provider=host_resources_provider,
|
205
198
|
).run()
|
@@ -1,13 +1,25 @@
|
|
1
|
-
from typing import Any
|
1
|
+
from typing import Any
|
2
2
|
|
3
3
|
from .local_fs_blob_store import LocalFSBLOBStore
|
4
4
|
from .metrics.blob_store import (
|
5
|
+
metric_abort_multipart_upload_errors,
|
6
|
+
metric_abort_multipart_upload_latency,
|
7
|
+
metric_abort_multipart_upload_requests,
|
8
|
+
metric_complete_multipart_upload_errors,
|
9
|
+
metric_complete_multipart_upload_latency,
|
10
|
+
metric_complete_multipart_upload_requests,
|
11
|
+
metric_create_multipart_upload_errors,
|
12
|
+
metric_create_multipart_upload_latency,
|
13
|
+
metric_create_multipart_upload_requests,
|
5
14
|
metric_get_blob_errors,
|
6
15
|
metric_get_blob_latency,
|
7
16
|
metric_get_blob_requests,
|
8
|
-
|
9
|
-
|
10
|
-
|
17
|
+
metric_presign_uri_errors,
|
18
|
+
metric_presign_uri_latency,
|
19
|
+
metric_presign_uri_requests,
|
20
|
+
metric_upload_blob_errors,
|
21
|
+
metric_upload_blob_latency,
|
22
|
+
metric_upload_blob_requests,
|
11
23
|
)
|
12
24
|
from .s3_blob_store import S3BLOBStore
|
13
25
|
|
@@ -15,12 +27,9 @@ from .s3_blob_store import S3BLOBStore
|
|
15
27
|
class BLOBStore:
|
16
28
|
"""Dispatches generic BLOB store calls to their real backends."""
|
17
29
|
|
18
|
-
def __init__(
|
19
|
-
self
|
20
|
-
|
21
|
-
"""Creates a BLOB store that uses the supplied BLOB stores."""
|
22
|
-
self._local: Optional[LocalFSBLOBStore] = local
|
23
|
-
self._s3: Optional[S3BLOBStore] = s3
|
30
|
+
def __init__(self):
|
31
|
+
self._local: LocalFSBLOBStore = LocalFSBLOBStore()
|
32
|
+
self._s3: S3BLOBStore = S3BLOBStore()
|
24
33
|
|
25
34
|
async def get(self, uri: str, logger: Any) -> bytes:
|
26
35
|
"""Returns binary value stored in BLOB with the supplied URI.
|
@@ -33,36 +42,111 @@ class BLOBStore:
|
|
33
42
|
):
|
34
43
|
metric_get_blob_requests.inc()
|
35
44
|
if _is_file_uri(uri):
|
36
|
-
self._check_local_is_available()
|
37
45
|
return await self._local.get(uri, logger)
|
38
46
|
else:
|
39
|
-
self._check_s3_is_available()
|
40
47
|
return await self._s3.get(uri, logger)
|
41
48
|
|
42
|
-
async def
|
49
|
+
async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
|
50
|
+
"""Returns a presigned URI for getting the BLOB with the supplied URI.
|
51
|
+
|
52
|
+
The URI allows to read any byte range in the BLOB."""
|
53
|
+
with (
|
54
|
+
metric_presign_uri_errors.count_exceptions(),
|
55
|
+
metric_presign_uri_latency.time(),
|
56
|
+
):
|
57
|
+
metric_presign_uri_requests.inc()
|
58
|
+
if _is_file_uri(uri):
|
59
|
+
return await self._local.presign_get_uri(uri, expires_in_sec, logger)
|
60
|
+
else:
|
61
|
+
return await self._s3.presign_get_uri(uri, expires_in_sec, logger)
|
62
|
+
|
63
|
+
async def upload(self, uri: str, value: bytes, logger: Any) -> None:
|
43
64
|
"""Stores the supplied binary value in a BLOB with the supplied URI.
|
44
65
|
|
45
66
|
Overwrites existing BLOB. Raises Exception on error.
|
46
67
|
"""
|
47
68
|
with (
|
48
|
-
|
49
|
-
|
69
|
+
metric_upload_blob_errors.count_exceptions(),
|
70
|
+
metric_upload_blob_latency.time(),
|
71
|
+
):
|
72
|
+
metric_upload_blob_requests.inc()
|
73
|
+
if _is_file_uri(uri):
|
74
|
+
await self._local.upload(uri, value, logger)
|
75
|
+
else:
|
76
|
+
await self._s3.upload(uri, value, logger)
|
77
|
+
|
78
|
+
async def create_multipart_upload(self, uri: str, logger: Any) -> str:
|
79
|
+
"""Creates a multipart upload for BLOB with the supplied URI and returns the upload ID."""
|
80
|
+
with (
|
81
|
+
metric_create_multipart_upload_errors.count_exceptions(),
|
82
|
+
metric_create_multipart_upload_latency.time(),
|
50
83
|
):
|
51
|
-
|
84
|
+
metric_create_multipart_upload_requests.inc()
|
52
85
|
if _is_file_uri(uri):
|
53
|
-
self.
|
54
|
-
await self._local.put(uri, value, logger)
|
86
|
+
return await self._local.create_multipart_upload(uri, logger)
|
55
87
|
else:
|
56
|
-
self.
|
57
|
-
await self._s3.put(uri, value, logger)
|
88
|
+
return await self._s3.create_multipart_upload(uri, logger)
|
58
89
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
90
|
+
async def complete_multipart_upload(
|
91
|
+
self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
|
92
|
+
) -> None:
|
93
|
+
"""Completes a multipart upload for BLOB with the supplied URI.
|
62
94
|
|
63
|
-
|
64
|
-
|
65
|
-
|
95
|
+
parts_etags is a list of ETags for the parts that were uploaded.
|
96
|
+
The list is ordered by part number starting from 1.
|
97
|
+
"""
|
98
|
+
with (
|
99
|
+
metric_complete_multipart_upload_errors.count_exceptions(),
|
100
|
+
metric_complete_multipart_upload_latency.time(),
|
101
|
+
):
|
102
|
+
metric_complete_multipart_upload_requests.inc()
|
103
|
+
if _is_file_uri(uri):
|
104
|
+
await self._local.complete_multipart_upload(
|
105
|
+
uri, upload_id, parts_etags, logger
|
106
|
+
)
|
107
|
+
else:
|
108
|
+
await self._s3.complete_multipart_upload(
|
109
|
+
uri, upload_id, parts_etags, logger
|
110
|
+
)
|
111
|
+
|
112
|
+
async def abort_multipart_upload(
|
113
|
+
self, uri: str, upload_id: str, logger: Any
|
114
|
+
) -> None:
|
115
|
+
"""Aborts a multipart upload for BLOB with the supplied URI."""
|
116
|
+
with (
|
117
|
+
metric_abort_multipart_upload_errors.count_exceptions(),
|
118
|
+
metric_abort_multipart_upload_latency.time(),
|
119
|
+
):
|
120
|
+
metric_abort_multipart_upload_requests.inc()
|
121
|
+
if _is_file_uri(uri):
|
122
|
+
await self._local.abort_multipart_upload(uri, upload_id, logger)
|
123
|
+
else:
|
124
|
+
await self._s3.abort_multipart_upload(uri, upload_id, logger)
|
125
|
+
|
126
|
+
async def presign_upload_part_uri(
|
127
|
+
self,
|
128
|
+
uri: str,
|
129
|
+
part_number: int,
|
130
|
+
upload_id: str,
|
131
|
+
expires_in_sec: int,
|
132
|
+
logger: Any,
|
133
|
+
) -> str:
|
134
|
+
"""Returns a presigned URI for uploading a part in a multipart upload.
|
135
|
+
|
136
|
+
part_number starts from 1."""
|
137
|
+
with (
|
138
|
+
metric_presign_uri_errors.count_exceptions(),
|
139
|
+
metric_presign_uri_latency.time(),
|
140
|
+
):
|
141
|
+
metric_presign_uri_requests.inc()
|
142
|
+
if _is_file_uri(uri):
|
143
|
+
return await self._local.presign_upload_part_uri(
|
144
|
+
uri, part_number, upload_id, expires_in_sec, logger
|
145
|
+
)
|
146
|
+
else:
|
147
|
+
return await self._s3.presign_upload_part_uri(
|
148
|
+
uri, part_number, upload_id, expires_in_sec, logger
|
149
|
+
)
|
66
150
|
|
67
151
|
|
68
152
|
def _is_file_uri(uri: str) -> bool:
|
@@ -16,7 +16,14 @@ class LocalFSBLOBStore:
|
|
16
16
|
# Run synchronous code in a thread to not block the event loop.
|
17
17
|
return await asyncio.to_thread(self._sync_get, _path_from_file_uri(uri))
|
18
18
|
|
19
|
-
async def
|
19
|
+
async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
|
20
|
+
"""Returns a presigned URI for getting the file at the supplied URI.
|
21
|
+
|
22
|
+
For local files, just returns the file URI itself.
|
23
|
+
"""
|
24
|
+
return uri
|
25
|
+
|
26
|
+
async def upload(self, uri: str, value: bytes, logger: Any) -> None:
|
20
27
|
"""Stores the supplied binary value in a file at the supplied URI.
|
21
28
|
|
22
29
|
The URI must be a file URI (starts with "file://"). The path must be absolute.
|
@@ -25,6 +32,39 @@ class LocalFSBLOBStore:
|
|
25
32
|
# Run synchronous code in a thread to not block the event loop.
|
26
33
|
return await asyncio.to_thread(self._sync_put, _path_from_file_uri(uri), value)
|
27
34
|
|
35
|
+
async def create_multipart_upload(self, uri: str, logger: Any) -> str:
|
36
|
+
"""Creates a multipart upload for local file and returns a dummy upload ID."""
|
37
|
+
# Local files do not require multipart upload, return a dummy ID
|
38
|
+
return "local-multipart-upload-id"
|
39
|
+
|
40
|
+
async def complete_multipart_upload(
|
41
|
+
self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
|
42
|
+
) -> None:
|
43
|
+
"""Completes a multipart upload for local file. No-op for local files."""
|
44
|
+
# No action needed for local files
|
45
|
+
return None
|
46
|
+
|
47
|
+
async def abort_multipart_upload(
|
48
|
+
self, uri: str, upload_id: str, logger: Any
|
49
|
+
) -> None:
|
50
|
+
"""Aborts a multipart upload for local file. No-op for local files."""
|
51
|
+
# No action needed for local files
|
52
|
+
return None
|
53
|
+
|
54
|
+
async def presign_upload_part_uri(
|
55
|
+
self,
|
56
|
+
uri: str,
|
57
|
+
part_number: int,
|
58
|
+
upload_id: str,
|
59
|
+
expires_in_sec: int,
|
60
|
+
logger: Any,
|
61
|
+
) -> str:
|
62
|
+
"""Returns a presigned URI for uploading a part in a multipart upload for local file.
|
63
|
+
|
64
|
+
For local files, just returns the file URI itself.
|
65
|
+
"""
|
66
|
+
return uri
|
67
|
+
|
28
68
|
def _sync_get(self, path: str) -> bytes:
|
29
69
|
if not os.path.isabs(path):
|
30
70
|
raise ValueError(f"Path {path} must be absolute")
|
@@ -3,31 +3,103 @@ import prometheus_client
|
|
3
3
|
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
4
|
|
5
5
|
metric_get_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
|
6
|
-
"
|
7
|
-
"Number of get
|
6
|
+
"blob_store_get_requests",
|
7
|
+
"Number of get BLOB requests in BLOB store",
|
8
8
|
)
|
9
9
|
metric_get_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
|
10
|
-
"
|
11
|
-
"Number of get
|
10
|
+
"blob_store_get_request_errors",
|
11
|
+
"Number of get BLOB request errors in BLOB store",
|
12
12
|
)
|
13
13
|
metric_get_blob_latency: prometheus_client.Histogram = (
|
14
14
|
latency_metric_for_fast_operation(
|
15
|
-
"
|
16
|
-
"get
|
15
|
+
"blob_store_get",
|
16
|
+
"BLOB store get BLOB request",
|
17
17
|
)
|
18
18
|
)
|
19
19
|
|
20
|
-
|
21
|
-
"
|
22
|
-
"Number of
|
20
|
+
metric_presign_uri_requests: prometheus_client.Counter = prometheus_client.Counter(
|
21
|
+
"blob_store_presign_uri_requests",
|
22
|
+
"Number of presign URI requests in BLOB store",
|
23
23
|
)
|
24
|
-
|
25
|
-
"
|
26
|
-
"Number of
|
24
|
+
metric_presign_uri_errors: prometheus_client.Counter = prometheus_client.Counter(
|
25
|
+
"blob_store_presign_uri_request_errors",
|
26
|
+
"Number of presign URI request errors in BLOB store",
|
27
27
|
)
|
28
|
-
|
28
|
+
metric_presign_uri_latency: prometheus_client.Histogram = (
|
29
29
|
latency_metric_for_fast_operation(
|
30
|
-
"
|
31
|
-
"
|
30
|
+
"blob_store_presign_uri",
|
31
|
+
"BLOB store presign URI request",
|
32
|
+
)
|
33
|
+
)
|
34
|
+
|
35
|
+
metric_upload_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
|
36
|
+
"blob_store_upload_requests",
|
37
|
+
"Number of upload BLOB requests in BLOB store",
|
38
|
+
)
|
39
|
+
metric_upload_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
|
40
|
+
"blob_store_upload_request_errors",
|
41
|
+
"Number of upload BLOB request errors in BLOB store",
|
42
|
+
)
|
43
|
+
metric_upload_blob_latency: prometheus_client.Histogram = (
|
44
|
+
latency_metric_for_fast_operation(
|
45
|
+
"blob_store_upload",
|
46
|
+
"BLOB store upload BLOB request",
|
47
|
+
)
|
48
|
+
)
|
49
|
+
|
50
|
+
metric_create_multipart_upload_requests: prometheus_client.Counter = (
|
51
|
+
prometheus_client.Counter(
|
52
|
+
"blob_store_create_multipart_upload_requests",
|
53
|
+
"Number of create multipart upload requests in BLOB store",
|
54
|
+
)
|
55
|
+
)
|
56
|
+
metric_create_multipart_upload_errors: prometheus_client.Counter = (
|
57
|
+
prometheus_client.Counter(
|
58
|
+
"blob_store_create_multipart_upload_request_errors",
|
59
|
+
"Number of create multipart upload request errors in BLOB store",
|
60
|
+
)
|
61
|
+
)
|
62
|
+
metric_create_multipart_upload_latency: prometheus_client.Histogram = (
|
63
|
+
latency_metric_for_fast_operation(
|
64
|
+
"blob_store_create_multipart_upload_request",
|
65
|
+
"create multipart upload request in BLOB store",
|
66
|
+
)
|
67
|
+
)
|
68
|
+
|
69
|
+
metric_complete_multipart_upload_requests: prometheus_client.Counter = (
|
70
|
+
prometheus_client.Counter(
|
71
|
+
"blob_store_complete_multipart_upload_requests",
|
72
|
+
"Number of complete multipart upload requests in BLOB store",
|
73
|
+
)
|
74
|
+
)
|
75
|
+
metric_complete_multipart_upload_errors: prometheus_client.Counter = (
|
76
|
+
prometheus_client.Counter(
|
77
|
+
"blob_store_complete_multipart_upload_request_errors",
|
78
|
+
"Number of complete multipart upload request errors in BLOB store",
|
79
|
+
)
|
80
|
+
)
|
81
|
+
metric_complete_multipart_upload_latency: prometheus_client.Histogram = (
|
82
|
+
latency_metric_for_fast_operation(
|
83
|
+
"blob_store_complete_multipart_upload_request",
|
84
|
+
"complete multipart upload request in BLOB store",
|
85
|
+
)
|
86
|
+
)
|
87
|
+
|
88
|
+
metric_abort_multipart_upload_requests: prometheus_client.Counter = (
|
89
|
+
prometheus_client.Counter(
|
90
|
+
"blob_store_abort_multipart_upload_requests",
|
91
|
+
"Number of abort multipart upload requests in BLOB store",
|
92
|
+
)
|
93
|
+
)
|
94
|
+
metric_abort_multipart_upload_errors: prometheus_client.Counter = (
|
95
|
+
prometheus_client.Counter(
|
96
|
+
"blob_store_abort_multipart_upload_request_errors",
|
97
|
+
"Number of abort multipart upload request errors in BLOB store",
|
98
|
+
)
|
99
|
+
)
|
100
|
+
metric_abort_multipart_upload_latency: prometheus_client.Histogram = (
|
101
|
+
latency_metric_for_fast_operation(
|
102
|
+
"blob_store_abort_multipart_upload_request",
|
103
|
+
"abort multipart upload request in BLOB store",
|
32
104
|
)
|
33
105
|
)
|
@@ -59,7 +59,28 @@ class S3BLOBStore:
|
|
59
59
|
logger.error("failed to get S3 object", uri=uri, exc_info=e)
|
60
60
|
raise
|
61
61
|
|
62
|
-
async def
|
62
|
+
async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
|
63
|
+
"""Returns a presigned URI for getting the S3 object at the supplied URI."""
|
64
|
+
self._lazy_create_client()
|
65
|
+
bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
|
66
|
+
try:
|
67
|
+
s3_uri: str = await asyncio.to_thread(
|
68
|
+
self._s3_client.generate_presigned_url,
|
69
|
+
ClientMethod="get_object",
|
70
|
+
Params={"Bucket": bucket_name, "Key": key},
|
71
|
+
ExpiresIn=expires_in_sec,
|
72
|
+
)
|
73
|
+
return s3_uri.replace("https://", "s3://", 1)
|
74
|
+
except Exception as e:
|
75
|
+
logger.error(
|
76
|
+
"failed to presign URI for get_object operation",
|
77
|
+
uri=uri,
|
78
|
+
exc_info=e,
|
79
|
+
expires_in_sec=expires_in_sec,
|
80
|
+
)
|
81
|
+
raise
|
82
|
+
|
83
|
+
async def upload(self, uri: str, value: bytes, logger: Any) -> None:
|
63
84
|
"""Stores the supplied binary value in a S3 object at the supplied URI.
|
64
85
|
|
65
86
|
The URI must be S3 URI (starts with "s3://").
|
@@ -75,6 +96,96 @@ class S3BLOBStore:
|
|
75
96
|
logger.error("failed to set S3 object", uri=uri, exc_info=e)
|
76
97
|
raise
|
77
98
|
|
99
|
+
async def create_multipart_upload(self, uri: str, logger: Any) -> str:
|
100
|
+
"""Creates a multipart upload for S3 object and returns the upload ID."""
|
101
|
+
self._lazy_create_client()
|
102
|
+
bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
|
103
|
+
try:
|
104
|
+
response = await asyncio.to_thread(
|
105
|
+
self._s3_client.create_multipart_upload,
|
106
|
+
Bucket=bucket_name,
|
107
|
+
Key=key,
|
108
|
+
)
|
109
|
+
return response["UploadId"]
|
110
|
+
except Exception as e:
|
111
|
+
logger.error("failed to create multipart upload", uri=uri, exc_info=e)
|
112
|
+
raise
|
113
|
+
|
114
|
+
async def complete_multipart_upload(
|
115
|
+
self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
|
116
|
+
) -> None:
|
117
|
+
"""Completes a multipart upload for S3 object."""
|
118
|
+
self._lazy_create_client()
|
119
|
+
bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
|
120
|
+
try:
|
121
|
+
await asyncio.to_thread(
|
122
|
+
self._s3_client.complete_multipart_upload,
|
123
|
+
Bucket=bucket_name,
|
124
|
+
Key=key,
|
125
|
+
UploadId=upload_id,
|
126
|
+
MultipartUpload={
|
127
|
+
"Parts": [
|
128
|
+
{"ETag": etag, "PartNumber": i + 1}
|
129
|
+
for i, etag in enumerate(parts_etags)
|
130
|
+
]
|
131
|
+
},
|
132
|
+
)
|
133
|
+
except Exception as e:
|
134
|
+
logger.error("failed to complete multipart upload", uri=uri, exc_info=e)
|
135
|
+
raise
|
136
|
+
|
137
|
+
async def abort_multipart_upload(
|
138
|
+
self, uri: str, upload_id: str, logger: Any
|
139
|
+
) -> None:
|
140
|
+
"""Aborts a multipart upload for S3 object."""
|
141
|
+
self._lazy_create_client()
|
142
|
+
bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
|
143
|
+
try:
|
144
|
+
await asyncio.to_thread(
|
145
|
+
self._s3_client.abort_multipart_upload,
|
146
|
+
Bucket=bucket_name,
|
147
|
+
Key=key,
|
148
|
+
UploadId=upload_id,
|
149
|
+
)
|
150
|
+
except Exception as e:
|
151
|
+
logger.error("failed to abort multipart upload", uri=uri, exc_info=e)
|
152
|
+
raise
|
153
|
+
|
154
|
+
async def presign_upload_part_uri(
|
155
|
+
self,
|
156
|
+
uri: str,
|
157
|
+
part_number: int,
|
158
|
+
upload_id: str,
|
159
|
+
expires_in_sec: int,
|
160
|
+
logger: Any,
|
161
|
+
) -> str:
|
162
|
+
"""Returns a presigned URI for uploading a part in a multipart upload for S3 object."""
|
163
|
+
self._lazy_create_client()
|
164
|
+
bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
|
165
|
+
try:
|
166
|
+
response = await asyncio.to_thread(
|
167
|
+
self._s3_client.generate_presigned_url,
|
168
|
+
ClientMethod="upload_part",
|
169
|
+
Params={
|
170
|
+
"Bucket": bucket_name,
|
171
|
+
"Key": key,
|
172
|
+
"UploadId": upload_id,
|
173
|
+
"PartNumber": part_number,
|
174
|
+
},
|
175
|
+
ExpiresIn=expires_in_sec,
|
176
|
+
)
|
177
|
+
return response
|
178
|
+
except Exception as e:
|
179
|
+
logger.error(
|
180
|
+
"failed to presign URI for upload_part operation",
|
181
|
+
uri=uri,
|
182
|
+
exc_info=e,
|
183
|
+
part_number=part_number,
|
184
|
+
upload_id=upload_id,
|
185
|
+
expires_in_sec=expires_in_sec,
|
186
|
+
)
|
187
|
+
raise
|
188
|
+
|
78
189
|
|
79
190
|
def _bucket_name_and_object_key_from_uri(uri: str) -> tuple[str, str]:
|
80
191
|
# Example S3 object URI:
|
@@ -1,14 +1,11 @@
|
|
1
1
|
import asyncio
|
2
2
|
from dataclasses import dataclass
|
3
|
-
from enum import Enum
|
4
3
|
from typing import Any, Optional
|
5
4
|
|
6
5
|
import grpc
|
7
6
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
8
7
|
InfoRequest,
|
9
8
|
InfoResponse,
|
10
|
-
InitializationFailureReason,
|
11
|
-
InitializationOutcomeCode,
|
12
9
|
InitializeRequest,
|
13
10
|
InitializeResponse,
|
14
11
|
)
|
@@ -18,6 +15,8 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
18
15
|
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
19
16
|
from tensorlake.utils.http_client import get_httpx_client
|
20
17
|
|
18
|
+
from indexify.executor.monitoring.metrics import IdempotentCounterChanger
|
19
|
+
|
21
20
|
from .health_checker import HealthChecker
|
22
21
|
from .invocation_state_client import InvocationStateClient
|
23
22
|
from .metrics.function_executor import (
|
@@ -60,19 +59,14 @@ from .server.function_executor_server_factory import (
|
|
60
59
|
)
|
61
60
|
|
62
61
|
|
63
|
-
class FunctionExecutorInitializationError(Enum):
|
64
|
-
FUNCTION_TIMEOUT = 1
|
65
|
-
FUNCTION_ERROR = 2
|
66
|
-
|
67
|
-
|
68
62
|
@dataclass
|
69
63
|
class FunctionExecutorInitializationResult:
|
70
64
|
"""Result of FunctionExecutor initialization."""
|
71
65
|
|
72
|
-
#
|
73
|
-
|
74
|
-
|
75
|
-
|
66
|
+
# If True, timed out waiting for the Function Executor to initialize.
|
67
|
+
is_timeout: bool
|
68
|
+
# FE is unresponsive if response is None.
|
69
|
+
response: Optional[InitializeResponse]
|
76
70
|
|
77
71
|
|
78
72
|
class FunctionExecutor:
|
@@ -89,12 +83,17 @@ class FunctionExecutor:
|
|
89
83
|
|
90
84
|
def __init__(self, server_factory: FunctionExecutorServerFactory, logger: Any):
|
91
85
|
self._server_factory: FunctionExecutorServerFactory = server_factory
|
92
|
-
self._logger = logger.bind(module=__name__)
|
86
|
+
self._logger: Any = logger.bind(module=__name__)
|
93
87
|
self._server: Optional[FunctionExecutorServer] = None
|
94
88
|
self._channel: Optional[grpc.aio.Channel] = None
|
95
89
|
self._invocation_state_client: Optional[InvocationStateClient] = None
|
96
90
|
self._health_checker: Optional[HealthChecker] = None
|
97
|
-
|
91
|
+
self._function_executors_counter_changer: IdempotentCounterChanger = (
|
92
|
+
IdempotentCounterChanger(
|
93
|
+
metric_function_executors_count,
|
94
|
+
)
|
95
|
+
)
|
96
|
+
self._function_executors_counter_changer.inc()
|
98
97
|
|
99
98
|
async def initialize(
|
100
99
|
self,
|
@@ -102,11 +101,11 @@ class FunctionExecutor:
|
|
102
101
|
initialize_request: InitializeRequest,
|
103
102
|
base_url: str,
|
104
103
|
config_path: Optional[str],
|
105
|
-
customer_code_timeout_sec:
|
104
|
+
customer_code_timeout_sec: float,
|
106
105
|
) -> FunctionExecutorInitializationResult:
|
107
106
|
"""Creates and initializes a FunctionExecutorServer and all resources associated with it.
|
108
107
|
|
109
|
-
Raises an Exception if an internal error occurred."""
|
108
|
+
Raises an Exception if an Executor-side internal error occurred."""
|
110
109
|
try:
|
111
110
|
with (
|
112
111
|
metric_create_errors.count_exceptions(),
|
@@ -126,7 +125,7 @@ class FunctionExecutor:
|
|
126
125
|
await self._create_health_checker(self._channel, stub)
|
127
126
|
|
128
127
|
return await _initialize_server(
|
129
|
-
stub, initialize_request, customer_code_timeout_sec
|
128
|
+
stub, initialize_request, customer_code_timeout_sec, self._logger
|
130
129
|
)
|
131
130
|
except Exception:
|
132
131
|
await self.destroy()
|
@@ -152,7 +151,7 @@ class FunctionExecutor:
|
|
152
151
|
metric_destroy_errors.count_exceptions(),
|
153
152
|
metric_destroy_latency.time(),
|
154
153
|
):
|
155
|
-
|
154
|
+
self._function_executors_counter_changer.dec()
|
156
155
|
metric_destroys.inc()
|
157
156
|
await self._destroy_health_checker()
|
158
157
|
await self._destroy_invocation_state_client()
|
@@ -306,7 +305,8 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
|
|
306
305
|
async def _initialize_server(
|
307
306
|
stub: FunctionExecutorStub,
|
308
307
|
initialize_request: InitializeRequest,
|
309
|
-
customer_code_timeout_sec:
|
308
|
+
customer_code_timeout_sec: float,
|
309
|
+
logger: Any,
|
310
310
|
) -> FunctionExecutorInitializationResult:
|
311
311
|
with (
|
312
312
|
metric_initialize_rpc_errors.count_exceptions(),
|
@@ -317,46 +317,22 @@ async def _initialize_server(
|
|
317
317
|
initialize_request,
|
318
318
|
timeout=customer_code_timeout_sec,
|
319
319
|
)
|
320
|
-
|
321
|
-
|
322
|
-
initialize_response
|
323
|
-
|
324
|
-
|
320
|
+
return FunctionExecutorInitializationResult(
|
321
|
+
is_timeout=False,
|
322
|
+
response=initialize_response,
|
323
|
+
)
|
324
|
+
except grpc.aio.AioRpcError as e:
|
325
|
+
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
325
326
|
return FunctionExecutorInitializationResult(
|
326
|
-
|
327
|
+
is_timeout=True,
|
328
|
+
response=None,
|
327
329
|
)
|
328
|
-
elif (
|
329
|
-
initialize_response.outcome_code
|
330
|
-
== InitializationOutcomeCode.INITIALIZE_OUTCOME_CODE_FAILURE
|
331
|
-
):
|
332
|
-
if (
|
333
|
-
initialize_response.failure_reason
|
334
|
-
== InitializationFailureReason.INITIALIZATION_FAILURE_REASON_FUNCTION_ERROR
|
335
|
-
):
|
336
|
-
return FunctionExecutorInitializationResult(
|
337
|
-
error=FunctionExecutorInitializationError.FUNCTION_ERROR,
|
338
|
-
stdout=initialize_response.stdout,
|
339
|
-
stderr=initialize_response.stderr,
|
340
|
-
)
|
341
|
-
elif (
|
342
|
-
initialize_response.failure_reason
|
343
|
-
== InitializationFailureReason.INITIALIZATION_FAILURE_REASON_INTERNAL_ERROR
|
344
|
-
):
|
345
|
-
# Don't add stdout/stderr because this is customer data.
|
346
|
-
raise RuntimeError("initialize RPC failed with internal error")
|
347
|
-
else:
|
348
|
-
raise ValueError(
|
349
|
-
f"unexpected failure reason {InitializationFailureReason.Name(initialize_response.failure_reason)} in initialize RPC response"
|
350
|
-
)
|
351
330
|
else:
|
352
|
-
|
353
|
-
|
331
|
+
logger.error(
|
332
|
+
"Function Executor initialize RPC failed",
|
333
|
+
exc_info=e,
|
354
334
|
)
|
355
|
-
|
356
|
-
except grpc.aio.AioRpcError as e:
|
357
|
-
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
358
335
|
return FunctionExecutorInitializationResult(
|
359
|
-
|
360
|
-
|
336
|
+
is_timeout=False,
|
337
|
+
response=None,
|
361
338
|
)
|
362
|
-
raise
|