indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/executor.py
CHANGED
@@ -1,54 +1,35 @@
 import asyncio
 import signal
-import time
 from pathlib import Path
 from socket import gethostname
-from typing import
+from typing import Dict, List, Optional
 
 import structlog
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-from tensorlake.utils.logging import suppress as suppress_logging
 
 from indexify.proto.executor_api_pb2 import ExecutorStatus
 
-from .api_objects import FunctionURI, Task
 from .blob_store.blob_store import BLOBStore
-from .
-from .
-
-
+from .channel_manager import ChannelManager
+from .function_allowlist import (
+    FunctionURI,
+    function_allowlist_to_indexed_dict,
+    parse_function_uris,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_manager import ChannelManager
-from .grpc.state_reconciler import ExecutorStateReconciler
-from .grpc.state_reporter import ExecutorStateReporter
 from .host_resources.host_resources import HostResourcesProvider
 from .metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
     metric_executor_info,
     metric_executor_state,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
-from .monitoring.function_allowlist import function_allowlist_to_info_dict
 from .monitoring.health_check_handler import HealthCheckHandler
 from .monitoring.health_checker.health_checker import HealthChecker
 from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
 from .monitoring.server import MonitoringServer
 from .monitoring.startup_probe_handler import StartupProbeHandler
-from .
-from .
-from .task_runner import TaskInput, TaskOutput, TaskRunner
+from .state_reconciler import ExecutorStateReconciler
+from .state_reporter import ExecutorStateReporter
 
 metric_executor_state.state("starting")
 
@@ -57,32 +38,26 @@ class Executor:
     def __init__(
         self,
         id: str,
-        flavor: ExecutorFlavor,
         version: str,
         labels: Dict[str, str],
-
+        cache_path: Path,
         health_checker: HealthChecker,
-
+        function_uris: List[str],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        enable_grpc_state_reconciler: bool,
         blob_store: BLOBStore,
         host_resources_provider: HostResourcesProvider,
     ):
         self._logger = structlog.get_logger(module=__name__)
-        self._is_shutdown: bool = False
         protocol: str = "http"
         if config_path:
             self._logger.info("running the extractor with TLS enabled")
             protocol = "https"
 
-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
         self._startup_probe_handler = StartupProbeHandler()
         self._monitoring_server = MonitoringServer(
             host=monitoring_server_host,
@@ -91,32 +66,17 @@ class Executor:
             health_probe_handler=HealthCheckHandler(health_checker),
             metrics_handler=PrometheusMetricsHandler(),
         )
-        self._function_executor_states = FunctionExecutorStatesContainer(
-            logger=self._logger
-        )
-        health_checker.set_function_executor_states_container(
-            self._function_executor_states
-        )
-        self._downloader = Downloader(
-            code_path=code_path,
-            base_url=self._base_url,
-            blob_store=blob_store,
-            config_path=config_path,
-        )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
             logger=self._logger,
         )
+        function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
         self._state_reporter = ExecutorStateReporter(
             executor_id=id,
-            flavor=flavor,
             version=version,
             labels=labels,
-            function_allowlist=
-            function_executor_states=self._function_executor_states,
+            function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
             logger=self._logger,
@@ -124,69 +84,48 @@ class Executor:
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
-        self.
-            base_url=self._base_url,
+        self._state_reconciler = ExecutorStateReconciler(
             executor_id=id,
+            function_executor_server_factory=function_executor_server_factory,
+            base_url=f"{protocol}://{server_addr}",
             config_path=config_path,
-
+            cache_path=cache_path,
             blob_store=blob_store,
+            channel_manager=self._channel_manager,
+            state_reporter=self._state_reporter,
+            logger=self._logger,
         )
-
-
-        self._task_runner: Optional[TaskRunner] = None
-        self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode state reconciler that runs tasks
-        self._state_reconciler: Optional[ExecutorStateReconciler] = None
-
-        if enable_grpc_state_reconciler:
-            self._state_reconciler = ExecutorStateReconciler(
-                executor_id=id,
-                function_executor_server_factory=self._function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-                downloader=self._downloader,
-                task_reporter=self._task_reporter,
-                channel_manager=self._channel_manager,
-                state_reporter=self._state_reporter,
-                logger=self._logger,
-            )
-        else:
-            self._task_runner = TaskRunner(
-                executor_id=id,
-                function_executor_server_factory=function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-            )
-            self._task_fetcher = TaskFetcher(
-                executor_id=id,
-                executor_version=version,
-                labels=labels,
-                function_allowlist=function_allowlist,
-                protocol=protocol,
-                indexify_server_addr=self._server_addr,
-                config_path=config_path,
-            )
+        self._run_aio_task: Optional[asyncio.Task] = None
+        self._shutdown_aio_task: Optional[asyncio.Task] = None
 
         executor_info: Dict[str, str] = {
             "id": id,
-            "flavor": flavor.name,
             "version": version,
-            "
+            "cache_path": str(cache_path),
             "server_addr": server_addr,
             "grpc_server_addr": str(grpc_server_addr),
             "config_path": str(config_path),
-            "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
         for key, value in labels.items():
             executor_info["label_" + key] = value
-        executor_info.update(
+        executor_info.update(function_allowlist_to_indexed_dict(function_allowlist))
         metric_executor_info.info(executor_info)
 
     def run(self):
         asyncio.new_event_loop()
+
+        self._run_aio_task = asyncio.get_event_loop().create_task(
+            self._run(),
+            name="executor startup and run loop",
+        )
+
+        try:
+            asyncio.get_event_loop().run_until_complete(self._run_aio_task)
+        except asyncio.CancelledError:
+            pass  # Expected exception on shutdown
+
+    async def _run(self):
         for signum in [
             signal.SIGABRT,
             signal.SIGINT,
@@ -195,235 +134,42 @@ class Executor:
             signal.SIGHUP,
         ]:
             asyncio.get_event_loop().add_signal_handler(
-                signum, self.
+                signum, self._shutdown_signal_handler, asyncio.get_event_loop()
             )
 
-        asyncio.
+        asyncio.create_task(
             self._monitoring_server.run(), name="monitoring server runner"
         )
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_RUNNING
         )
-
-
-        )
-
+        self._state_reporter.run()
+        self._state_reconciler.run()
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()
 
-
-        if self._state_reconciler is None:
-            asyncio.get_event_loop().run_until_complete(
-                self._http_task_runner_loop()
-            )
-        else:
-            asyncio.get_event_loop().run_until_complete(
-                self._grpc_state_reconciler_loop()
-            )
-        except asyncio.CancelledError:
-            pass  # Suppress this expected exception and return without error (normally).
-
-    async def _grpc_state_reconciler_loop(self):
-        """Runs the gRPC state reconciler and state reporter.
-
-        Never raises any exceptions."""
-        await self._state_reconciler.run()
-
-    async def _http_task_runner_loop(self):
-        while not self._is_shutdown:
-            try:
-                async for task in self._task_fetcher.run():
-                    metric_tasks_fetched.inc()
-                    if not self._is_shutdown:
-                        asyncio.create_task(
-                            self._run_task(task), name="task runner (http mode)"
-                        )
-                self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
-            except Exception as e:
-                self._logger.error(
-                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
-                )
-            if not self._is_shutdown:
-                await asyncio.sleep(5)
-
-    async def _run_task(self, task: Task) -> None:
-        """Runs the supplied task.
-
-        Doesn't raise any Exceptions. All errors are reported to the server."""
-        start_time: float = time.monotonic()
-        logger = self._task_logger(task)
-        output: Optional[TaskOutput] = None
-
-        try:
-            output = await self._run_task_and_get_output(task, logger)
-            logger.info("task execution finished", success=output.success)
-        except Exception as e:
-            output = TaskOutput.internal_error(
-                task_id=task.id,
-                namespace=task.namespace,
-                graph_name=task.compute_graph,
-                function_name=task.compute_fn,
-                graph_version=task.graph_version,
-                graph_invocation_id=task.invocation_id,
-                output_payload_uri_prefix=task.output_payload_uri_prefix,
-            )
-            logger.error("task execution failed", exc_info=e)
-
-        if output.metrics is not None:
-            self.log_function_metrics(output)
-
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output=output, logger=logger)
-
-        metric_task_completion_latency.observe(time.monotonic() - start_time)
-
-    def log_function_metrics(self, output: TaskOutput):
-        for counter_name, counter_value in output.metrics.counters.items():
-            self._logger.info(
-                f"function_metric",
-                counter_name=counter_name,
-                counter_value=counter_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-        for timer_name, timer_value in output.metrics.timers.items():
-            self._logger.info(
-                f"function_metric",
-                timer_name=timer_name,
-                timer_value=timer_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-
-    async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
-        graph: SerializedObject = await self._downloader.download_graph(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_version=task.graph_version,
-            logger=logger,
-            data_payload=task.graph_payload,
-        )
-        input: SerializedObject = await self._downloader.download_input(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_invocation_id=task.invocation_id,
-            input_key=task.input_key,
-            data_payload=task.input_payload,
-            logger=logger,
-        )
-        init_value: Optional[SerializedObject] = (
-            None
-            if task.reducer_output_id is None and task.reducer_input_payload is None
-            else (
-                await self._downloader.download_init_value(
-                    namespace=task.namespace,
-                    graph_name=task.compute_graph,
-                    function_name=task.compute_fn,
-                    graph_invocation_id=task.invocation_id,
-                    reducer_output_key=task.reducer_output_id,
-                    data_payload=task.reducer_input_payload,
-                    logger=logger,
-                )
-            )
-        )
-        return await self._task_runner.run(
-            TaskInput(
-                task=task,
-                graph=graph,
-                input=input,
-                init_value=init_value,
-            ),
-            logger=logger,
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
+        # Run the Executor forever until it is shut down.
         while True:
-
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(5)
+            await asyncio.sleep(10)
 
-
-        if
-
-
-        ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    async def _shutdown(self, loop):
-        self._logger.info(
-            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
-        )
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STOPPING
+    def _shutdown_signal_handler(self, loop):
+        if self._shutdown_aio_task is None:
+            self._shutdown_aio_task = loop.create_task(
+                self._shutdown(), name="executor shutdown"
             )
-        metric_executor_state.state("shutting_down")
-        # There will be lots of task cancellation exceptions and "X is shutting down"
-        # exceptions logged during Executor shutdown. Suppress their logs as they are
-        # expected and are confusing for users.
-        suppress_logging()
 
-
-
-
-
-        if self._task_runner is not None:
-            await self._task_runner.shutdown()
-
-        if self._state_reporter is not None:
-            await self._state_reporter.shutdown()
-        if self._state_reconciler is not None:
-            await self._state_reconciler.shutdown()
-        if self._channel_manager is not None:
-            await self._channel_manager.destroy()
-
-        # We need to shutdown all users of FE states first,
-        # otherwise states might disappear unexpectedly and we might
-        # report errors, etc that are expected.
-        await self._function_executor_states.shutdown()
-        # We mainly need to cancel the task that runs _.*_mode_loop().
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-        # The current task is cancelled, the code after this line will not run.
+    async def _shutdown(self):
+        self._logger.info("shutting down Executor")
+        metric_executor_state.state("shutting_down")
 
-
-
+        # Shutdown state reconciler first because it changes reported state on shutdown.
+        await self._state_reconciler.shutdown()
 
-
-
-
-            graph=task.compute_graph,
-            graph_version=task.graph_version,
-            invocation_id=task.invocation_id,
-            function_name=task.compute_fn,
-            task_id=task.id,
+        # Do one last state report with STOPPED status. This reduces latency in the system.
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STOPPED
         )
+        await self._state_reporter.shutdown()
+        await self._channel_manager.destroy()
+        await self._monitoring_server.shutdown()
+        self._run_aio_task.cancel()
indexify/executor/function_allowlist.py
ADDED
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+@dataclass
+class FunctionURI:
+    namespace: str
+    compute_graph: str
+    compute_fn: str
+    version: Optional[str] = None
+
+
+def function_allowlist_to_indexed_dict(
+    function_allowlist: List[FunctionURI],
+) -> Dict[str, str]:
+    """Returns a dictionary with each function URI in the allowlist as a key-value pair.
+
+    The keys are prefixed indexes in function allowlist, and the values are the function URIs
+    """
+    indexed_dict = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        indexed_dict[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return indexed_dict
+
+
+def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
+    """Parses a list of function URIs from strings to FunctionURI objects."""
+    uris: List[FunctionURI] = []
+    for uri_str in function_uri_strs:
+        tokens = uri_str.split(":")
+        if len(tokens) < 3 or len(tokens) > 4:
+            raise ValueError(
+                "Function should be specified as <namespace>:<workflow>:<function>:<version> or"
+                "<namespace>:<workflow>:<function>"
+            )
+        version: Optional[str] = None
+        if len(tokens) == 4:
+            version = tokens[3]
+
+        uris.append(
+            FunctionURI(
+                namespace=tokens[0],
+                compute_graph=tokens[1],
+                compute_fn=tokens[2],
+                version=version,
+            )
+        )
+
+    return uris
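
A hedged usage sketch of the two helpers above; the namespace, workflow, and function names are made up:

from indexify.executor.function_allowlist import (
    function_allowlist_to_indexed_dict,
    parse_function_uris,
)

# Hypothetical allowlist entries: <namespace>:<workflow>:<function>[:<version>].
uris = parse_function_uris(["prod:billing:compute_invoice:3", "prod:billing:ingest"])
print(function_allowlist_to_indexed_dict(uris))
# {'function_allowlist_0': 'prod:billing:compute_invoice:3',
#  'function_allowlist_1': 'prod:billing:ingest:None'}

Note that a versionless URI is rendered with the literal string None, since the joiner calls str(function_uri.version).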
indexify/executor/function_executor/function_executor.py
CHANGED
@@ -56,7 +56,11 @@ from .server.function_executor_server_factory import (
 )
 
 
-class
+class FunctionError(RuntimeError):
+    pass
+
+
+class FunctionTimeoutError(FunctionError):
     pass
 
 
@@ -92,7 +96,7 @@ class FunctionExecutor:
    ):
        """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
-        Raises
+        Raises FunctionError if the server failed to initialize due to an error in customer owned code or data.
        Raises an Exception if an internal error occured."""
        try:
            with (
@@ -134,7 +138,9 @@ class FunctionExecutor:
    async def destroy(self):
        """Destroys all resources owned by this FunctionExecutor.
 
-        Never raises any exceptions but logs them."""
+        Never raises any exceptions but logs them.
+        Idempotent.
+        """
        try:
            with (
                metric_destroy_errors.count_exceptions(),
@@ -312,12 +318,12 @@ async def _initialize_server(
        if initialize_response.success:
            return
        if initialize_response.HasField("customer_error"):
-            raise
+            raise FunctionError(initialize_response.customer_error)
        else:
            raise Exception("initialize RPC failed at function executor server")
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            raise
-            f"
+            raise FunctionTimeoutError(
+                f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec."
            ) from e
        raise
indexify/executor/function_executor/invocation_state_client.py
CHANGED
@@ -15,7 +15,6 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 )
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 
-from ..downloader import serialized_object_from_http_response
 from .metrics.invocation_state_client import (
     metric_request_read_errors,
     metric_server_get_state_request_errors,
@@ -78,11 +77,18 @@ class InvocationStateClient:
        If a request is not comming from the task ID that was added here then it will
        be rejected. It's caller's responsibility to only add task IDs that are being
        executed by the Function Executor so the Function Executor can't get access to
-        invocation state of tasks it doesn't run."""
+        invocation state of tasks it doesn't run.
+
+        Doesn't raise any exceptions.
+        """
        self._task_id_to_invocation_id[task_id] = invocation_id
 
    def remove_task_to_invocation_id_entry(self, task_id: str) -> None:
-
+        """Removes a task ID to invocation ID entry from the client's internal state.
+
+        Doesn't raise any exceptions.
+        """
+        self._task_id_to_invocation_id.pop(task_id, None)
 
    async def destroy(self) -> None:
        if self._request_loop_task is not None:
@@ -257,3 +263,19 @@ class InvocationStateClient:
            )
        else:
            raise ValueError("unknown request type")
+
+
+def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
+    # We're hardcoding the content type currently used by Python SDK. It might change in the future.
+    # There's no other way for now to determine if the response is a bytes or string.
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(
+            bytes=response.content, content_type=response.headers["content-type"]
+        )
+    else:
+        return SerializedObject(
+            string=response.text, content_type=response.headers["content-type"]
+        )
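
A hedged sketch of calling the relocated helper; the import path follows the file list above, and the URL is a placeholder:

import httpx

from indexify.executor.function_executor.invocation_state_client import (
    serialized_object_from_http_response,
)

# Placeholder endpoint; real callers receive the response from the Indexify server.
response: httpx.Response = httpx.get("http://localhost:8900/some/payload")
obj = serialized_object_from_http_response(response)
# Binary content types ("application/octet-stream", "application/pickle") land in
# obj.bytes; any other content type is treated as text and lands in obj.string.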
indexify/executor/function_executor/server/function_executor_server_factory.py
CHANGED
@@ -24,9 +24,9 @@ class FunctionExecutorServerConfiguration:
     graph_version: str
     image_uri: Optional[str]
     secret_names: List[str]
-    cpu_ms_per_sec:
-    memory_bytes:
-    disk_bytes:
+    cpu_ms_per_sec: int
+    memory_bytes: int
+    disk_bytes: int
     gpu_count: int
 
 
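
A small arithmetic sketch of the now-required resource fields. The unit reading of cpu_ms_per_sec (CPU milliseconds per wall-clock second, i.e. 1000 per full core) is an inference from the field name, not something the diff states:

# Assumed unit conversions, for illustration only.
cores = 2
cpu_ms_per_sec = cores * 1000   # 2 full cores -> 2000 CPU-ms per second
memory_bytes = 4 * 1024**3      # 4 GiB
disk_bytes = 10 * 1024**3       # 10 GiB
gpu_count = 0
print(cpu_ms_per_sec, memory_bytes, disk_bytes, gpu_count)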