indexify-0.3.31-py3-none-any.whl → indexify-0.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +158 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +69 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +68 -60
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +129 -108
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
- indexify-0.4.3.dist-info/RECORD +68 -0
- indexify-0.4.3.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
indexify/executor/executor.py
CHANGED
@@ -1,54 +1,35 @@
 import asyncio
 import signal
-import time
 from pathlib import Path
 from socket import gethostname
-from typing import
+from typing import Dict, List, Optional

 import structlog
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-from tensorlake.utils.logging import suppress as suppress_logging

 from indexify.proto.executor_api_pb2 import ExecutorStatus

-from .api_objects import FunctionURI, Task
 from .blob_store.blob_store import BLOBStore
-from .
-from .
-
-
+from .channel_manager import ChannelManager
+from .function_allowlist import (
+    FunctionURI,
+    function_allowlist_to_indexed_dict,
+    parse_function_uris,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_manager import ChannelManager
-from .grpc.state_reconciler import ExecutorStateReconciler
-from .grpc.state_reporter import ExecutorStateReporter
 from .host_resources.host_resources import HostResourcesProvider
 from .metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
     metric_executor_info,
     metric_executor_state,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
-from .monitoring.function_allowlist import function_allowlist_to_info_dict
 from .monitoring.health_check_handler import HealthCheckHandler
 from .monitoring.health_checker.health_checker import HealthChecker
 from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
 from .monitoring.server import MonitoringServer
 from .monitoring.startup_probe_handler import StartupProbeHandler
-from .
-from .
-from .task_runner import TaskInput, TaskOutput, TaskRunner
+from .state_reconciler import ExecutorStateReconciler
+from .state_reporter import ExecutorStateReporter

 metric_executor_state.state("starting")

@@ -57,33 +38,26 @@ class Executor:
     def __init__(
         self,
         id: str,
-        development_mode: bool,
-        flavor: ExecutorFlavor,
         version: str,
         labels: Dict[str, str],
-
+        cache_path: Path,
         health_checker: HealthChecker,
-
+        function_uris: List[str],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
         grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        enable_grpc_state_reconciler: bool,
         blob_store: BLOBStore,
         host_resources_provider: HostResourcesProvider,
     ):
         self._logger = structlog.get_logger(module=__name__)
-        self._is_shutdown: bool = False
         protocol: str = "http"
         if config_path:
            self._logger.info("running the extractor with TLS enabled")
            protocol = "https"

-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
         self._startup_probe_handler = StartupProbeHandler()
         self._monitoring_server = MonitoringServer(
             host=monitoring_server_host,
@@ -92,33 +66,17 @@ class Executor:
             health_probe_handler=HealthCheckHandler(health_checker),
             metrics_handler=PrometheusMetricsHandler(),
         )
-        self._function_executor_states = FunctionExecutorStatesContainer(
-            logger=self._logger
-        )
-        health_checker.set_function_executor_states_container(
-            self._function_executor_states
-        )
-        self._downloader = Downloader(
-            code_path=code_path,
-            base_url=self._base_url,
-            blob_store=blob_store,
-            config_path=config_path,
-        )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
             logger=self._logger,
         )
+        function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
         self._state_reporter = ExecutorStateReporter(
             executor_id=id,
-            development_mode=development_mode,
-            flavor=flavor,
             version=version,
             labels=labels,
-            function_allowlist=
-            function_executor_states=self._function_executor_states,
+            function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
             logger=self._logger,
@@ -126,69 +84,48 @@ class Executor:
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
-        self.
-            base_url=self._base_url,
+        self._state_reconciler = ExecutorStateReconciler(
             executor_id=id,
+            function_executor_server_factory=function_executor_server_factory,
+            base_url=f"{protocol}://{server_addr}",
             config_path=config_path,
-
+            cache_path=cache_path,
             blob_store=blob_store,
+            channel_manager=self._channel_manager,
+            state_reporter=self._state_reporter,
+            logger=self._logger,
         )
-
-
-        self._task_runner: Optional[TaskRunner] = None
-        self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode state reconciler that runs tasks
-        self._state_reconciler: Optional[ExecutorStateReconciler] = None
-
-        if enable_grpc_state_reconciler:
-            self._state_reconciler = ExecutorStateReconciler(
-                executor_id=id,
-                function_executor_server_factory=self._function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-                downloader=self._downloader,
-                task_reporter=self._task_reporter,
-                channel_manager=self._channel_manager,
-                state_reporter=self._state_reporter,
-                logger=self._logger,
-            )
-        else:
-            self._task_runner = TaskRunner(
-                executor_id=id,
-                function_executor_server_factory=function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-            )
-            self._task_fetcher = TaskFetcher(
-                executor_id=id,
-                executor_version=version,
-                labels=labels,
-                function_allowlist=function_allowlist,
-                protocol=protocol,
-                indexify_server_addr=self._server_addr,
-                config_path=config_path,
-            )
+        self._run_aio_task: Optional[asyncio.Task] = None
+        self._shutdown_aio_task: Optional[asyncio.Task] = None

         executor_info: Dict[str, str] = {
             "id": id,
-            "flavor": flavor.name,
             "version": version,
-            "
+            "cache_path": str(cache_path),
             "server_addr": server_addr,
             "grpc_server_addr": str(grpc_server_addr),
             "config_path": str(config_path),
-            "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
         for key, value in labels.items():
             executor_info["label_" + key] = value
-        executor_info.update(
+        executor_info.update(function_allowlist_to_indexed_dict(function_allowlist))
         metric_executor_info.info(executor_info)

     def run(self):
         asyncio.new_event_loop()
+
+        self._run_aio_task = asyncio.get_event_loop().create_task(
+            self._run(),
+            name="executor startup and run loop",
+        )
+
+        try:
+            asyncio.get_event_loop().run_until_complete(self._run_aio_task)
+        except asyncio.CancelledError:
+            pass  # Expected exception on shutdown
+
+    async def _run(self):
         for signum in [
             signal.SIGABRT,
             signal.SIGINT,
@@ -197,235 +134,42 @@ class Executor:
             signal.SIGHUP,
         ]:
             asyncio.get_event_loop().add_signal_handler(
-                signum, self.
+                signum, self._shutdown_signal_handler, asyncio.get_event_loop()
             )

-        asyncio.
+        asyncio.create_task(
             self._monitoring_server.run(), name="monitoring server runner"
         )
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_RUNNING
         )
-
-
-        )
-
+        self._state_reporter.run()
+        self._state_reconciler.run()
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()

-
-            if self._state_reconciler is None:
-                asyncio.get_event_loop().run_until_complete(
-                    self._http_task_runner_loop()
-                )
-            else:
-                asyncio.get_event_loop().run_until_complete(
-                    self._grpc_state_reconciler_loop()
-                )
-        except asyncio.CancelledError:
-            pass  # Suppress this expected exception and return without error (normally).
-
-    async def _grpc_state_reconciler_loop(self):
-        """Runs the gRPC state reconciler and state reporter.
-
-        Never raises any exceptions."""
-        await self._state_reconciler.run()
-
-    async def _http_task_runner_loop(self):
-        while not self._is_shutdown:
-            try:
-                async for task in self._task_fetcher.run():
-                    metric_tasks_fetched.inc()
-                    if not self._is_shutdown:
-                        asyncio.create_task(
-                            self._run_task(task), name="task runner (http mode)"
-                        )
-                self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
-            except Exception as e:
-                self._logger.error(
-                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
-                )
-            if not self._is_shutdown:
-                await asyncio.sleep(5)
-
-    async def _run_task(self, task: Task) -> None:
-        """Runs the supplied task.
-
-        Doesn't raise any Exceptions. All errors are reported to the server."""
-        start_time: float = time.monotonic()
-        logger = self._task_logger(task)
-        output: Optional[TaskOutput] = None
-
-        try:
-            output = await self._run_task_and_get_output(task, logger)
-            logger.info("task execution finished", success=output.success)
-        except Exception as e:
-            output = TaskOutput.internal_error(
-                task_id=task.id,
-                namespace=task.namespace,
-                graph_name=task.compute_graph,
-                function_name=task.compute_fn,
-                graph_version=task.graph_version,
-                graph_invocation_id=task.invocation_id,
-                output_payload_uri_prefix=task.output_payload_uri_prefix,
-            )
-            logger.error("task execution failed", exc_info=e)
-
-        if output.metrics is not None:
-            self.log_function_metrics(output)
-
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output=output, logger=logger)
-
-        metric_task_completion_latency.observe(time.monotonic() - start_time)
-
-    def log_function_metrics(self, output: TaskOutput):
-        for counter_name, counter_value in output.metrics.counters.items():
-            self._logger.info(
-                f"function_metric",
-                counter_name=counter_name,
-                counter_value=counter_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-        for timer_name, timer_value in output.metrics.timers.items():
-            self._logger.info(
-                f"function_metric",
-                timer_name=timer_name,
-                timer_value=timer_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-
-    async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
-        graph: SerializedObject = await self._downloader.download_graph(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_version=task.graph_version,
-            logger=logger,
-            data_payload=task.graph_payload,
-        )
-        input: SerializedObject = await self._downloader.download_input(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_invocation_id=task.invocation_id,
-            input_key=task.input_key,
-            data_payload=task.input_payload,
-            logger=logger,
-        )
-        init_value: Optional[SerializedObject] = (
-            None
-            if task.reducer_output_id is None and task.reducer_input_payload is None
-            else (
-                await self._downloader.download_init_value(
-                    namespace=task.namespace,
-                    graph_name=task.compute_graph,
-                    function_name=task.compute_fn,
-                    graph_invocation_id=task.invocation_id,
-                    reducer_output_key=task.reducer_output_id,
-                    data_payload=task.reducer_input_payload,
-                    logger=logger,
-                )
-            )
-        )
-        return await self._task_runner.run(
-            TaskInput(
-                task=task,
-                graph=graph,
-                input=input,
-                init_value=init_value,
-            ),
-            logger=logger,
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
+        # Run the Executor forever until it is shut down.
         while True:
-
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(5)
+            await asyncio.sleep(10)

-
-        if
-
-
-            ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    async def _shutdown(self, loop):
-        self._logger.info(
-            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
-        )
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STOPPING
+    def _shutdown_signal_handler(self, loop):
+        if self._shutdown_aio_task is None:
+            self._shutdown_aio_task = loop.create_task(
+                self._shutdown(), name="executor shutdown"
             )
-        metric_executor_state.state("shutting_down")
-        # There will be lots of task cancellation exceptions and "X is shutting down"
-        # exceptions logged during Executor shutdown. Suppress their logs as they are
-        # expected and are confusing for users.
-        suppress_logging()

-
-
-
-
-        if self._task_runner is not None:
-            await self._task_runner.shutdown()
-
-        if self._state_reporter is not None:
-            await self._state_reporter.shutdown()
-        if self._state_reconciler is not None:
-            await self._state_reconciler.shutdown()
-        if self._channel_manager is not None:
-            await self._channel_manager.destroy()
-
-        # We need to shutdown all users of FE states first,
-        # otherwise states might disappear unexpectedly and we might
-        # report errors, etc that are expected.
-        await self._function_executor_states.shutdown()
-        # We mainly need to cancel the task that runs _.*_mode_loop().
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-        # The current task is cancelled, the code after this line will not run.
+    async def _shutdown(self):
+        self._logger.info("shutting down Executor")
+        metric_executor_state.state("shutting_down")

-
-
+        # Shutdown state reconciler first because it changes reported state on shutdown.
+        await self._state_reconciler.shutdown()

-
-
-
-            graph=task.compute_graph,
-            graph_version=task.graph_version,
-            invocation_id=task.invocation_id,
-            function_name=task.compute_fn,
-            task_id=task.id,
+        # Do one last state report with STOPPED status. This reduces latency in the system.
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STOPPED
         )
+        await self._state_reporter.shutdown()
+        await self._channel_manager.destroy()
+        await self._monitoring_server.shutdown()
+        self._run_aio_task.cancel()
indexify/executor/function_allowlist.py
ADDED
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+@dataclass
+class FunctionURI:
+    namespace: str
+    compute_graph: str
+    compute_fn: str
+    version: Optional[str] = None
+
+
+def function_allowlist_to_indexed_dict(
+    function_allowlist: List[FunctionURI],
+) -> Dict[str, str]:
+    """Returns a dictionary with each function URI in the allowlist as a key-value pair.
+
+    The keys are prefixed indexes in function allowlist, and the values are the function URIs
+    """
+    indexed_dict = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        indexed_dict[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return indexed_dict
+
+
+def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
+    """Parses a list of function URIs from strings to FunctionURI objects."""
+    uris: List[FunctionURI] = []
+    for uri_str in function_uri_strs:
+        tokens = uri_str.split(":")
+        if len(tokens) < 3 or len(tokens) > 4:
+            raise ValueError(
+                "Function should be specified as <namespace>:<workflow>:<function>:<version> or"
+                "<namespace>:<workflow>:<function>"
+            )
+        version: Optional[str] = None
+        if len(tokens) == 4:
+            version = tokens[3]
+
+        uris.append(
+            FunctionURI(
+                namespace=tokens[0],
+                compute_graph=tokens[1],
+                compute_fn=tokens[2],
+                version=version,
+            )
+        )
+
+    return uris
indexify/executor/function_executor/function_executor.py
CHANGED
@@ -56,7 +56,11 @@ from .server.function_executor_server_factory import (
 )


-class
+class FunctionError(RuntimeError):
+    pass
+
+
+class FunctionTimeoutError(FunctionError):
     pass


@@ -92,7 +96,7 @@ class FunctionExecutor:
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.

-        Raises
+        Raises FunctionError if the server failed to initialize due to an error in customer owned code or data.
         Raises an Exception if an internal error occured."""
         try:
             with (
@@ -134,7 +138,9 @@ class FunctionExecutor:
     async def destroy(self):
         """Destroys all resources owned by this FunctionExecutor.

-        Never raises any exceptions but logs them.
+        Never raises any exceptions but logs them.
+        Idempotent.
+        """
         try:
             with (
                 metric_destroy_errors.count_exceptions(),
@@ -312,12 +318,12 @@ async def _initialize_server(
         if initialize_response.success:
             return
         if initialize_response.HasField("customer_error"):
-            raise
+            raise FunctionError(initialize_response.customer_error)
         else:
             raise Exception("initialize RPC failed at function executor server")
     except grpc.aio.AioRpcError as e:
         if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            raise
-                f"
+            raise FunctionTimeoutError(
+                f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec."
             ) from e
         raise
indexify/executor/function_executor/invocation_state_client.py
CHANGED
@@ -15,7 +15,6 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 )
 from tensorlake.function_executor.proto.message_validator import MessageValidator

-from ..downloader import serialized_object_from_http_response
 from .metrics.invocation_state_client import (
     metric_request_read_errors,
     metric_server_get_state_request_errors,
@@ -78,11 +77,18 @@ class InvocationStateClient:
         If a request is not comming from the task ID that was added here then it will
         be rejected. It's caller's responsibility to only add task IDs that are being
         executed by the Function Executor so the Function Executor can't get access to
-        invocation state of tasks it doesn't run.
+        invocation state of tasks it doesn't run.
+
+        Doesn't raise any exceptions.
+        """
         self._task_id_to_invocation_id[task_id] = invocation_id

     def remove_task_to_invocation_id_entry(self, task_id: str) -> None:
-
+        """Removes a task ID to invocation ID entry from the client's internal state.
+
+        Doesn't raise any exceptions.
+        """
+        self._task_id_to_invocation_id.pop(task_id, None)

     async def destroy(self) -> None:
         if self._request_loop_task is not None:
@@ -257,3 +263,19 @@ class InvocationStateClient:
             )
         else:
             raise ValueError("unknown request type")
+
+
+def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
+    # We're hardcoding the content type currently used by Python SDK. It might change in the future.
+    # There's no other way for now to determine if the response is a bytes or string.
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(
+            bytes=response.content, content_type=response.headers["content-type"]
+        )
+    else:
+        return SerializedObject(
+            string=response.text, content_type=response.headers["content-type"]
+        )
indexify/executor/function_executor/server/function_executor_server_factory.py
CHANGED
@@ -24,9 +24,9 @@ class FunctionExecutorServerConfiguration:
     graph_version: str
     image_uri: Optional[str]
     secret_names: List[str]
-    cpu_ms_per_sec:
-    memory_bytes:
-    disk_bytes:
+    cpu_ms_per_sec: int
+    memory_bytes: int
+    disk_bytes: int
     gpu_count: int
