indexify-0.3.17-py3-none-any.whl → indexify-0.3.19-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- indexify/cli/cli.py +21 -18
- indexify/executor/api_objects.py +12 -0
- indexify/executor/downloader.py +4 -1
- indexify/executor/executor.py +65 -28
- indexify/executor/executor_flavor.py +7 -0
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +3 -1
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +20 -2
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +35 -2
- indexify/executor/grpc/channel_manager.py +160 -0
- indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify/executor/grpc/function_executor_controller.py +421 -0
- indexify/executor/grpc/state_reconciler.py +33 -38
- indexify/executor/grpc/state_reporter.py +100 -39
- indexify/executor/grpc/task_controller.py +449 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_fetcher.py +8 -3
- indexify/executor/task_reporter.py +112 -4
- indexify/executor/task_runner.py +1 -0
- indexify/proto/{task_scheduler.proto → executor_api.proto} +86 -11
- indexify/proto/executor_api_pb2.py +80 -0
- indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +162 -7
- indexify/proto/executor_api_pb2_grpc.py +227 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/RECORD +32 -28
- indexify/executor/grpc/channel_creator.py +0 -53
- indexify/proto/task_scheduler_pb2.py +0 -64
- indexify/proto/task_scheduler_pb2_grpc.py +0 -170
- /indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
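The file-level summary above comes straight from the registry's diff viewer. As a rough local equivalent, the standard-library sketch below (the wheel file names are assumed to be already downloaded into the working directory) lists added, removed, and changed files between the two wheels; it is illustrative only, not how this page was produced.

```python
import zipfile

OLD_WHEEL = "indexify-0.3.17-py3-none-any.whl"  # assumed local paths
NEW_WHEEL = "indexify-0.3.19-py3-none-any.whl"

with zipfile.ZipFile(OLD_WHEEL) as old_whl, zipfile.ZipFile(NEW_WHEEL) as new_whl:
    old_files = set(old_whl.namelist())
    new_files = set(new_whl.namelist())
    for name in sorted(new_files - old_files):
        print("added  ", name)
    for name in sorted(old_files - new_files):
        print("removed", name)
    # Compare the bytes of files present in both wheels.
    for name in sorted(old_files & new_files):
        if old_whl.read(name) != new_whl.read(name):
            print("changed", name)
```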
indexify/cli/cli.py
CHANGED
@@ -13,7 +13,7 @@ import sys
 from importlib.metadata import version
 from pathlib import Path
 from socket import gethostname
-from typing import Annotated, List, Optional, Tuple
+from typing import Annotated, Dict, List, Optional, Tuple
 
 import nanoid
 import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
 
 from indexify.executor.api_objects import FunctionURI
 from indexify.executor.executor import Executor
+from indexify.executor.executor_flavor import ExecutorFlavor
 from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
     SubprocessFunctionExecutorServerFactory,
 )
@@ -77,6 +78,7 @@ def build_image(
 )
 def executor(
     server_addr: str = "localhost:8900",
+    grpc_server_addr: str = "localhost:8901",
     dev: Annotated[
         bool, typer.Option("--dev", "-d", help="Run the executor in development mode")
     ] = False,
@@ -119,17 +121,6 @@ def executor(
             help="Port where to run Executor Monitoring server",
         ),
     ] = 7000,
-    # TODO: Figure out mTLS for gRPC.
-    grpc_server_addr: Annotated[
-        Optional[str],
-        typer.Option(
-            "--grpc-server-addr",
-            help=(
-                "(exprimental) Address of server gRPC API to connect to, e.g. 'localhost:8901'.\n"
-                "Enables gRPC state reporter that will periodically report the state of the Function Executors to Server\n"
-            ),
-        ),
-    ] = None,
     enable_grpc_state_reconciler: Annotated[
         bool,
         typer.Option(
@@ -140,6 +131,15 @@ def executor(
             ),
         ),
     ] = False,
+    labels: Annotated[
+        List[str],
+        typer.Option(
+            "--label",
+            "-l",
+            help="Executor key-value label to be sent to the Server. "
+            "Specified as <key>=<value>",
+        ),
+    ] = [],
 ):
     if dev:
         configure_development_mode_logging()
@@ -157,10 +157,10 @@ def executor(
             "--executor-id should be at least 10 characters long and only include characters _-[0-9][a-z][A-Z]"
         )
 
-
-
-
-
+    kv_labels: Dict[str, str] = {}
+    for label in labels:
+        key, value = label.split("=")
+        kv_labels[key] = value
 
     executor_version = version("indexify")
     logger = structlog.get_logger(module=__name__, executor_id=executor_id)
@@ -169,15 +169,16 @@
         "starting executor",
         hostname=gethostname(),
         server_addr=server_addr,
+        grpc_server_addr=grpc_server_addr,
         config_path=config_path,
         executor_version=executor_version,
+        labels=kv_labels,
         executor_cache=executor_cache,
         ports=ports,
         functions=function_uris,
         dev_mode=dev,
         monitoring_server_host=monitoring_server_host,
         monitoring_server_port=monitoring_server_port,
-        grpc_server_addr=grpc_server_addr,
         enable_grpc_state_reconciler=enable_grpc_state_reconciler,
     )
 
@@ -205,7 +206,9 @@ def executor(
     Executor(
         id=executor_id,
         development_mode=dev,
+        flavor=ExecutorFlavor.OSS,
         version=executor_version,
+        labels=kv_labels,
         health_checker=GenericHealthChecker(),
         code_path=executor_cache,
         function_allowlist=_parse_function_uris(function_uris),
@@ -214,10 +217,10 @@
             server_ports=range(ports[0], ports[1]),
         ),
         server_addr=server_addr,
+        grpc_server_addr=grpc_server_addr,
        config_path=config_path,
        monitoring_server_host=monitoring_server_host,
        monitoring_server_port=monitoring_server_port,
-        grpc_server_addr=grpc_server_addr,
        enable_grpc_state_reconciler=enable_grpc_state_reconciler,
    ).run()
 
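To illustrate the new `--label`/`-l` option introduced above, here is a minimal, hypothetical typer command (not the actual indexify CLI) that folds repeated `<key>=<value>` labels into a dict the same way the new `kv_labels` loop does. The shipped code uses a plain `label.split("=")`; this sketch splits once so `=` can appear inside values.

```python
from typing import Annotated, Dict, List

import typer

app = typer.Typer()


@app.command()
def executor(
    labels: Annotated[
        List[str],
        typer.Option("--label", "-l", help="Executor key-value label as <key>=<value>"),
    ] = [],
):
    kv_labels: Dict[str, str] = {}
    for label in labels:
        key, value = label.split("=", 1)  # the shipped CLI uses label.split("=")
        kv_labels[key] = value
    typer.echo(kv_labels)


if __name__ == "__main__":
    app()  # e.g. python sketch.py -l region=us-east-1 -l tier=gpu
```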
indexify/executor/api_objects.py
CHANGED
@@ -49,5 +49,17 @@ class TaskResult(BaseModel):
     reducer: bool = False
 
 
+class DataPayload(BaseModel):
+    path: str
+    size: int
+    sha256_hash: str
+
+
+class IngestFnOutputsResponse(BaseModel):
+    data_payloads: List[DataPayload]
+    stdout: Optional[DataPayload] = None
+    stderr: Optional[DataPayload] = None
+
+
 TASK_OUTCOME_SUCCESS = "success"
 TASK_OUTCOME_FAILURE = "failure"
indexify/executor/downloader.py
CHANGED
@@ -241,7 +241,10 @@ class Downloader:
 def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
     # We're hardcoding the content type currently used by Python SDK. It might change in the future.
     # There's no other way for now to determine if the response is a bytes or string.
-    if response.headers["content-type"]
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
         return SerializedObject(
             bytes=response.content, content_type=response.headers["content-type"]
         )
indexify/executor/executor.py
CHANGED
@@ -9,17 +9,18 @@ import structlog
 from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
 from tensorlake.utils.logging import suppress as suppress_logging
 
-from indexify.proto.
+from indexify.proto.executor_api_pb2 import ExecutorStatus
 
 from .api_objects import FunctionURI, Task
 from .downloader import Downloader
+from .executor_flavor import ExecutorFlavor
 from .function_executor.function_executor_states_container import (
     FunctionExecutorStatesContainer,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.
+from .grpc.channel_manager import ChannelManager
 from .grpc.state_reconciler import ExecutorStateReconciler
 from .grpc.state_reporter import ExecutorStateReporter
 from .metrics.executor import (
@@ -55,16 +56,18 @@ class Executor:
         self,
         id: str,
         development_mode: bool,
+        flavor: ExecutorFlavor,
         version: str,
+        labels: Dict[str, str],
         code_path: Path,
         health_checker: HealthChecker,
         function_allowlist: Optional[List[FunctionURI]],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
+        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        grpc_server_addr: Optional[str],
         enable_grpc_state_reconciler: bool,
     ):
         self._logger = structlog.get_logger(module=__name__)
@@ -94,36 +97,40 @@ class Executor:
         self._downloader = Downloader(
             code_path=code_path, base_url=self._base_url, config_path=config_path
         )
+        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
+        self._function_executor_server_factory = function_executor_server_factory
+        self._channel_manager = ChannelManager(
+            server_address=grpc_server_addr,
+            config_path=config_path,
+            logger=self._logger,
+        )
+        self._state_reporter = ExecutorStateReporter(
+            executor_id=id,
+            flavor=flavor,
+            version=version,
+            labels=labels,
+            development_mode=development_mode,
+            function_allowlist=self._function_allowlist,
+            function_executor_states=self._function_executor_states,
+            channel_manager=self._channel_manager,
+            logger=self._logger,
+        )
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
+        )
         self._task_reporter = TaskReporter(
             base_url=self._base_url,
             executor_id=id,
             config_path=config_path,
+            channel_manager=self._channel_manager,
         )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
 
-        # HTTP mode
+        # HTTP mode task runner
         self._task_runner: Optional[TaskRunner] = None
         self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode
-        self._channel_creator: Optional[ChannelCreator] = None
-        self._state_reporter: Optional[ExecutorStateReporter] = None
+        # gRPC mode state reconciler that runs tasks
         self._state_reconciler: Optional[ExecutorStateReconciler] = None
 
-        if grpc_server_addr is not None:
-            self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
-            self._state_reporter = ExecutorStateReporter(
-                executor_id=id,
-                development_mode=development_mode,
-                function_allowlist=self._function_allowlist,
-                function_executor_states=self._function_executor_states,
-                channel_creator=self._channel_creator,
-                logger=self._logger,
-            )
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
-            )
-
         if enable_grpc_state_reconciler:
             self._state_reconciler = ExecutorStateReconciler(
                 executor_id=id,
@@ -133,7 +140,8 @@ class Executor:
                 config_path=config_path,
                 downloader=self._downloader,
                 task_reporter=self._task_reporter,
-
+                channel_manager=self._channel_manager,
+                state_reporter=self._state_reporter,
                 logger=self._logger,
             )
         else:
@@ -147,6 +155,7 @@ class Executor:
             self._task_fetcher = TaskFetcher(
                 executor_id=id,
                 executor_version=version,
+                labels=labels,
                 function_allowlist=function_allowlist,
                 protocol=protocol,
                 indexify_server_addr=self._server_addr,
@@ -159,8 +168,8 @@ class Executor:
             "version": version,
             "code_path": str(code_path),
             "server_addr": server_addr,
-            "config_path": str(config_path),
             "grpc_server_addr": str(grpc_server_addr),
+            "config_path": str(config_path),
             "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
@@ -244,6 +253,9 @@ class Executor:
             )
             logger.error("task execution failed", exc_info=e)
 
+        if output.metrics is not None:
+            self.log_function_metrics(output)
+
         with (
             metric_tasks_reporting_outcome.track_inprogress(),
             metric_task_outcome_report_latency.time(),
@@ -253,6 +265,28 @@ class Executor:
 
         metric_task_completion_latency.observe(time.monotonic() - start_time)
 
+    def log_function_metrics(self, output: TaskOutput):
+        for counter_name, counter_value in output.metrics.counters.items():
+            self._logger.info(
+                f"function_metric",
+                counter_name=counter_name,
+                counter_value=counter_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+        for timer_name, timer_value in output.metrics.timers.items():
+            self._logger.info(
+                f"function_metric",
+                timer_name=timer_name,
+                timer_value=timer_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+
     async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
         graph: SerializedObject = await self._downloader.download_graph(
             namespace=task.namespace,
@@ -326,7 +360,9 @@ class Executor:
         ).inc()
 
     async def _shutdown(self, loop):
-        self._logger.info(
+        self._logger.info(
+            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
+        )
         if self._state_reporter is not None:
             self._state_reporter.update_executor_status(
                 ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +375,13 @@ class Executor:
 
         self._is_shutdown = True
         await self._monitoring_server.shutdown()
+        await self._task_reporter.shutdown()
 
         if self._task_runner is not None:
            await self._task_runner.shutdown()
 
-        if self.
-            await self.
+        if self._channel_manager is not None:
+            await self._channel_manager.shutdown()
        if self._state_reporter is not None:
            await self._state_reporter.shutdown()
        if self._state_reconciler is not None:
indexify/executor/function_executor/function_executor.py
CHANGED
@@ -88,6 +88,7 @@ class FunctionExecutor:
         initialize_request: InitializeRequest,
         base_url: str,
         config_path: Optional[str],
+        customer_code_timeout_sec: Optional[float] = None,
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
@@ -103,7 +104,9 @@ class FunctionExecutor:
         await self._establish_channel()
         stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
         await _collect_server_info(stub)
-        await _initialize_server(
+        await _initialize_server(
+            stub, initialize_request, customer_code_timeout_sec
+        )
         await self._create_invocation_state_client(
             stub=stub,
             base_url=base_url,
@@ -293,18 +296,28 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
 
 
 async def _initialize_server(
-    stub: FunctionExecutorStub,
+    stub: FunctionExecutorStub,
+    initialize_request: InitializeRequest,
+    customer_code_timeout_sec: Optional[float],
 ) -> None:
     with (
         metric_initialize_rpc_errors.count_exceptions(),
         metric_initialize_rpc_latency.time(),
     ):
-
-
-
-
-
-
-
-
-
+        try:
+            initialize_response: InitializeResponse = await stub.initialize(
+                initialize_request,
+                timeout=customer_code_timeout_sec,
+            )
+            if initialize_response.success:
+                return
+            if initialize_response.HasField("customer_error"):
+                raise CustomerError(initialize_response.customer_error)
+            else:
+                raise Exception("initialize RPC failed at function executor server")
+        except grpc.aio.AioRpcError as e:
+            if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                raise CustomerError(
+                    f"Customer code timeout {customer_code_timeout_sec} sec expired"
+                ) from e
+            raise
indexify/executor/function_executor/function_executor_state.py
CHANGED
@@ -25,6 +25,7 @@ class FunctionExecutorState:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
         logger: Any,
     ):
         # Read only fields.
@@ -33,6 +34,7 @@ class FunctionExecutorState:
         self.graph_name: str = graph_name
         self.function_name: str = function_name
         self.image_uri: Optional[str] = image_uri
+        self.secret_names: List[str] = secret_names
         self._logger: Any = logger.bind(
             module=__name__,
             function_executor_id=id,
@@ -47,6 +49,7 @@ class FunctionExecutorState:
         # TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
         self.graph_version: str = graph_version
         self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
+        self.status_message: str = ""
         self.status_change_notifier: asyncio.Condition = asyncio.Condition(
             lock=self.lock
         )
@@ -62,7 +65,9 @@ class FunctionExecutorState:
         while self.status not in allowlist:
             await self.status_change_notifier.wait()
 
-    async def set_status(
+    async def set_status(
+        self, new_status: FunctionExecutorStatus, status_message: str = ""
+    ) -> None:
         """Sets the status of the Function Executor.
 
         The caller must hold the lock.
@@ -70,6 +75,7 @@ class FunctionExecutorState:
         """
         self.check_locked()
         if is_status_change_allowed(self.status, new_status):
+            # If status didn't change then still log it for visibility.
            self._logger.info(
                "function executor status changed",
                old_status=self.status.name,
@@ -78,12 +84,14 @@ class FunctionExecutorState:
            metric_function_executors_with_status.labels(status=self.status.name).dec()
            metric_function_executors_with_status.labels(status=new_status.name).inc()
            self.status = new_status
+            self.status_message = status_message
            self.status_change_notifier.notify_all()
        else:
            raise ValueError(
                f"Invalid status change from {self.status} to {new_status}"
            )
 
+    # TODO: Delete this method once HTTP protocol is removed as it's used only there.
     async def destroy_function_executor(self) -> None:
         """Destroys the Function Executor if it exists.
 
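`FunctionExecutorState` (above) pairs its status and the new `status_message` with an `asyncio.Condition` so other coroutines can block until the executor reaches an expected status. The following self-contained sketch shows just that wait/notify pattern, with a reduced status enum and no indexify code:

```python
import asyncio
from enum import Enum
from typing import List


class Status(Enum):
    DESTROYED = "Destroyed"
    IDLE = "Idle"


class StateSketch:
    def __init__(self) -> None:
        self.lock = asyncio.Lock()
        self.status = Status.DESTROYED
        self.status_message = ""
        # Sharing the lock lets Condition.wait() release it while a waiter is blocked.
        self.status_change_notifier = asyncio.Condition(lock=self.lock)

    async def wait_for_status(self, allowlist: List[Status]) -> None:
        # Caller must hold self.lock.
        while self.status not in allowlist:
            await self.status_change_notifier.wait()

    async def set_status(self, new_status: Status, status_message: str = "") -> None:
        # Caller must hold self.lock.
        self.status = new_status
        self.status_message = status_message
        self.status_change_notifier.notify_all()


async def main() -> None:
    state = StateSketch()

    async def make_idle() -> None:
        async with state.lock:
            await state.set_status(Status.IDLE, "startup finished")

    async with state.lock:
        task = asyncio.create_task(make_idle())
        await state.wait_for_status([Status.IDLE])
        print(state.status, state.status_message)
    await task


asyncio.run(main())
```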
indexify/executor/function_executor/function_executor_states_container.py
CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, AsyncGenerator, Dict, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from .function_executor_state import FunctionExecutorState
 from .function_executor_status import FunctionExecutorStatus
@@ -26,6 +26,7 @@ class FunctionExecutorStatesContainer:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
     ) -> FunctionExecutorState:
         """Get or create a function executor state with the given ID.
 
@@ -45,6 +46,7 @@ class FunctionExecutorStatesContainer:
                 graph_version=graph_version,
                 function_name=function_name,
                 image_uri=image_uri,
+                secret_names=secret_names,
                 logger=self._logger,
             )
             self._states[id] = state
indexify/executor/function_executor/function_executor_status.py
CHANGED
@@ -23,6 +23,7 @@ class FunctionExecutorStatus(Enum):
     UNHEALTHY = "Unhealthy"
     # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
     # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
+    # RUNNING_TASK -> DESTROYING
     # UNHEALTHY -> DESTROYING
     # IDLE -> DESTROYING
     DESTROYING = "Destroying"
@@ -69,6 +70,7 @@ def is_status_change_allowed(
     ],
     FunctionExecutorStatus.RUNNING_TASK: [
         FunctionExecutorStatus.RUNNING_TASK,
+        FunctionExecutorStatus.DESTROYING,
         FunctionExecutorStatus.IDLE,
         FunctionExecutorStatus.UNHEALTHY,
         FunctionExecutorStatus.SHUTDOWN,
indexify/executor/function_executor/health_checker.py
CHANGED
@@ -70,8 +70,10 @@ class HealthChecker:
         # code is not involved when TCP connections are established to FE. Problems reestablishing
         # the TCP connection are usually due to the FE process crashing and its gRPC server socket
         # not being available anymore or due to prolonged local networking failures on Executor.
-
-
+        if (
+            _channel_state(self._channel, self._logger)
+            == grpc.ChannelConnectivity.TRANSIENT_FAILURE
+        ):
             return HealthCheckResult(
                 is_healthy=False,
                 reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
@@ -126,3 +128,19 @@ class HealthChecker:
 
         asyncio.create_task(self._health_check_failed_callback(result))
         self._health_check_loop_task = None
+
+
+def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
+    """Get channel connectivity state and suppresses all exceptions.
+
+    Suppressing the exceptions is important because the channel connectivity state is an experimental
+    feature. On error fallse back to READY state which assumes that the channel is okay.
+    """
+    try:
+        return channel.get_state()
+    except Exception as e:
+        logger.error(
+            "Failed getting channel state, falling back to default READY state",
+            exc_info=e,
+        )
+        return grpc.ChannelConnectivity.READY
indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
CHANGED
@@ -25,6 +25,12 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
         logger = logger.bind(module=__name__)
         port: Optional[int] = None
 
+        if len(config.secret_names) > 0:
+            logger.warning(
+                "Subprocess Function Executor does not support secrets. Please supply secrets as environment variables.",
+                secret_names=config.secret_names,
+            )
+
         try:
             port = self._allocate_port()
             args = [
indexify/executor/function_executor/single_task_runner.py
CHANGED
@@ -10,6 +10,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
+from tensorlake.function_executor.proto.message_validator import MessageValidator
 
 from ..api_objects import Task
 from .function_executor import CustomerError, FunctionExecutor
@@ -26,7 +27,7 @@ from .server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
 from .task_input import TaskInput
-from .task_output import TaskOutput
+from .task_output import TaskMetrics, TaskOutput
 
 
 class SingleTaskRunner:
@@ -286,16 +287,17 @@ class _RunningTaskContextManager:
 
 
 def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
-
-
-
-
-
-
-
-
-    if
-
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
 
     output = TaskOutput(
         task_id=task.id,
@@ -308,10 +310,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
         stderr=response.stderr,
         reducer=response.is_reducer,
         success=response.success,
+        metrics=metrics,
     )
 
     if response.HasField("function_output"):
         output.function_output = response.function_output
+        output.output_encoding = response.function_output.output_encoding
     if response.HasField("router_output"):
         output.router_output = response.router_output
 
indexify/executor/function_executor/task_output.py
CHANGED
@@ -1,11 +1,17 @@
-from typing import Optional
+from typing import Dict, Optional
 
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     FunctionOutput,
     RouterOutput,
 )
 
-
+
+class TaskMetrics:
+    """Metrics for a task."""
+
+    def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
+        self.counters = counters
+        self.timers = timers
 
 
 class TaskOutput:
@@ -19,6 +25,7 @@ class TaskOutput:
         function_name: str,
         graph_version: str,
         graph_invocation_id: str,
+        output_encoding: Optional[str] = None,
         function_output: Optional[FunctionOutput] = None,
         router_output: Optional[RouterOutput] = None,
         stdout: Optional[str] = None,
@@ -26,6 +33,7 @@ class TaskOutput:
         reducer: bool = False,
         success: bool = False,
         is_internal_error: bool = False,
+        metrics: Optional[TaskMetrics] = None,
     ):
         self.task_id = task_id
         self.namespace = namespace
@@ -40,6 +48,8 @@ class TaskOutput:
         self.reducer = reducer
         self.success = success
         self.is_internal_error = is_internal_error
+        self.metrics = metrics
+        self.output_encoding = output_encoding
 
     @classmethod
     def internal_error(
@@ -63,3 +73,26 @@ class TaskOutput:
             stderr="Platform failed to execute the function.",
             is_internal_error=True,
         )
+
+    @classmethod
+    def function_timeout(
+        cls,
+        task_id: str,
+        namespace: str,
+        graph_name: str,
+        function_name: str,
+        graph_version: str,
+        graph_invocation_id: str,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for an function timeout error."""
+        # Task stdout, stderr is not available.
+        return TaskOutput(
+            task_id=task_id,
+            namespace=namespace,
+            graph_name=graph_name,
+            function_name=function_name,
+            graph_version=graph_version,
+            graph_invocation_id=graph_invocation_id,
+            stderr="Function execution timed out.",
+            is_internal_error=False,
+        )