indexify 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- indexify/cli/cli.py +15 -17
- indexify/executor/api_objects.py +12 -0
- indexify/executor/blob_store/blob_store.py +69 -0
- indexify/executor/blob_store/local_fs_blob_store.py +48 -0
- indexify/executor/blob_store/metrics/blob_store.py +33 -0
- indexify/executor/blob_store/s3_blob_store.py +85 -0
- indexify/executor/downloader.py +149 -25
- indexify/executor/executor.py +77 -41
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +8 -1
- indexify/executor/function_executor/function_executor_status.py +4 -0
- indexify/executor/function_executor/health_checker.py +7 -2
- indexify/executor/function_executor/invocation_state_client.py +4 -2
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +36 -2
- indexify/executor/grpc/channel_manager.py +4 -3
- indexify/executor/grpc/function_executor_controller.py +391 -0
- indexify/executor/grpc/metrics/state_reconciler.py +17 -0
- indexify/executor/grpc/metrics/task_controller.py +8 -0
- indexify/executor/grpc/state_reconciler.py +324 -217
- indexify/executor/grpc/state_reporter.py +52 -41
- indexify/executor/grpc/task_controller.py +492 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_reporter.py +115 -6
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +91 -7
- indexify/proto/executor_api_pb2.py +49 -37
- indexify/proto/executor_api_pb2.pyi +158 -3
- indexify/proto/executor_api_pb2_grpc.py +47 -0
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
indexify/executor/executor.py
CHANGED
@@ -12,6 +12,7 @@ from tensorlake.utils.logging import suppress as suppress_logging
 from indexify.proto.executor_api_pb2 import ExecutorStatus
 
 from .api_objects import FunctionURI, Task
+from .blob_store.blob_store import BLOBStore
 from .downloader import Downloader
 from .executor_flavor import ExecutorFlavor
 from .function_executor.function_executor_states_container import (
@@ -64,11 +65,12 @@ class Executor:
         function_allowlist: Optional[List[FunctionURI]],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
+        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        grpc_server_addr: Optional[str],
         enable_grpc_state_reconciler: bool,
+        blob_store: BLOBStore,
     ):
         self._logger = structlog.get_logger(module=__name__)
         self._is_shutdown: bool = False
@@ -95,45 +97,45 @@ class Executor:
             self._function_executor_states
         )
         self._downloader = Downloader(
-            code_path=code_path,
+            code_path=code_path,
+            base_url=self._base_url,
+            blob_store=blob_store,
+            config_path=config_path,
+        )
+        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
+        self._function_executor_server_factory = function_executor_server_factory
+        self._channel_manager = ChannelManager(
+            server_address=grpc_server_addr,
+            config_path=config_path,
+            logger=self._logger,
+        )
+        self._state_reporter = ExecutorStateReporter(
+            executor_id=id,
+            flavor=flavor,
+            version=version,
+            labels=labels,
+            development_mode=development_mode,
+            function_allowlist=self._function_allowlist,
+            function_executor_states=self._function_executor_states,
+            channel_manager=self._channel_manager,
+            logger=self._logger,
+        )
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
         self._task_reporter = TaskReporter(
             base_url=self._base_url,
             executor_id=id,
             config_path=config_path,
+            channel_manager=self._channel_manager,
         )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
 
-        # HTTP mode
+        # HTTP mode task runner
         self._task_runner: Optional[TaskRunner] = None
         self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode
-        self._channel_manager: Optional[ChannelManager] = None
-        self._state_reporter: Optional[ExecutorStateReporter] = None
+        # gRPC mode state reconciler that runs tasks
         self._state_reconciler: Optional[ExecutorStateReconciler] = None
 
-        if grpc_server_addr is not None:
-            self._channel_manager = ChannelManager(
-                server_address=grpc_server_addr,
-                config_path=config_path,
-                logger=self._logger,
-            )
-            self._state_reporter = ExecutorStateReporter(
-                executor_id=id,
-                flavor=flavor,
-                version=version,
-                labels=labels,
-                development_mode=development_mode,
-                function_allowlist=self._function_allowlist,
-                function_executor_states=self._function_executor_states,
-                channel_manager=self._channel_manager,
-                logger=self._logger,
-            )
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
-            )
-
         if enable_grpc_state_reconciler:
             self._state_reconciler = ExecutorStateReconciler(
                 executor_id=id,
@@ -171,8 +173,8 @@ class Executor:
             "version": version,
             "code_path": str(code_path),
             "server_addr": server_addr,
-            "config_path": str(config_path),
             "grpc_server_addr": str(grpc_server_addr),
+            "config_path": str(config_path),
             "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
@@ -192,12 +194,15 @@ class Executor:
             signum, self.shutdown, asyncio.get_event_loop()
         )
 
-        asyncio.get_event_loop().create_task(
-
-
-
-
-
+        asyncio.get_event_loop().create_task(
+            self._monitoring_server.run(), name="monitoring server runner"
+        )
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_RUNNING
+        )
+        asyncio.get_event_loop().create_task(
+            self._state_reporter.run(), name="state reporter runner"
+        )
 
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()
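Every background task spawned here is now given an explicit name. That pays off when debugging hangs: asyncio.all_tasks() and crash dumps then show "monitoring server runner" instead of an anonymous Task-N. A self-contained sketch of the pattern (plain stdlib, not indexify code):

import asyncio

async def monitoring_server() -> None:
    await asyncio.sleep(3600)  # stand-in for a long-running server loop

async def main() -> None:
    # The name is visible via Task.get_name(), in asyncio.all_tasks() output,
    # and in tracebacks for failed or never-awaited tasks.
    task = asyncio.get_event_loop().create_task(
        monitoring_server(), name="monitoring server runner"
    )
    print([t.get_name() for t in asyncio.all_tasks()])
    task.cancel()

asyncio.run(main())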
@@ -218,7 +223,6 @@ class Executor:
         """Runs the gRPC state reconciler and state reporter.
 
         Never raises any exceptions."""
-        asyncio.create_task(self._state_reporter.run())
         await self._state_reconciler.run()
 
     async def _http_task_runner_loop(self):
@@ -227,11 +231,15 @@ class Executor:
             async for task in self._task_fetcher.run():
                 metric_tasks_fetched.inc()
                 if not self._is_shutdown:
-                    asyncio.create_task(
+                    asyncio.create_task(
+                        self._run_task(task), name="task runner (http mode)"
+                    )
+            self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
         except Exception as e:
             self._logger.error(
                 "failed fetching tasks, retrying in 5 seconds", exc_info=e
             )
+        if not self._is_shutdown:
             await asyncio.sleep(5)
 
     async def _run_task(self, task: Task) -> None:
@@ -256,6 +264,9 @@ class Executor:
             )
             logger.error("task execution failed", exc_info=e)
 
+        if output.metrics is not None:
+            self.log_function_metrics(output)
+
         with (
             metric_tasks_reporting_outcome.track_inprogress(),
             metric_task_outcome_report_latency.time(),
@@ -265,18 +276,42 @@ class Executor:
 
         metric_task_completion_latency.observe(time.monotonic() - start_time)
 
+    def log_function_metrics(self, output: TaskOutput):
+        for counter_name, counter_value in output.metrics.counters.items():
+            self._logger.info(
+                f"function_metric",
+                counter_name=counter_name,
+                counter_value=counter_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+        for timer_name, timer_value in output.metrics.timers.items():
+            self._logger.info(
+                f"function_metric",
+                timer_name=timer_name,
+                timer_value=timer_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+
     async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
         graph: SerializedObject = await self._downloader.download_graph(
             namespace=task.namespace,
             graph_name=task.compute_graph,
             graph_version=task.graph_version,
             logger=logger,
+            data_payload=None,
         )
         input: SerializedObject = await self._downloader.download_input(
             namespace=task.namespace,
             graph_name=task.compute_graph,
             graph_invocation_id=task.invocation_id,
             input_key=task.input_key,
+            data_payload=None,
             logger=logger,
         )
         init_value: Optional[SerializedObject] = (
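The new log_function_metrics emits one structured event per function-reported counter and timer. A minimal runnable sketch of the same structlog pattern (metric names and field values here are illustrative):

import structlog

logger = structlog.get_logger(module="executor")

counters = {"documents_parsed": 10}  # illustrative function metrics
for counter_name, counter_value in counters.items():
    # structlog renders keyword arguments as key=value pairs on the event, e.g.
    # event='function_metric' counter_name='documents_parsed' counter_value=10
    logger.info(
        "function_metric",
        counter_name=counter_name,
        counter_value=counter_value,
        invocation_id="inv-123",  # illustrative
    )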
@@ -289,6 +324,7 @@ class Executor:
                 function_name=task.compute_fn,
                 graph_invocation_id=task.invocation_id,
                 reducer_output_key=task.reducer_output_id,
+                data_payload=None,
                 logger=logger,
             )
         )
@@ -358,12 +394,12 @@ class Executor:
         if self._task_runner is not None:
             await self._task_runner.shutdown()
 
-        if self._channel_manager is not None:
-            await self._channel_manager.shutdown()
         if self._state_reporter is not None:
            await self._state_reporter.shutdown()
         if self._state_reconciler is not None:
             await self._state_reconciler.shutdown()
+        if self._channel_manager is not None:
+            await self._channel_manager.destroy()
 
         # We need to shutdown all users of FE states first,
         # otherwise states might disappear unexpectedly and we might
@@ -375,7 +411,7 @@ class Executor:
         # The current task is cancelled, the code after this line will not run.
 
     def shutdown(self, loop):
-        loop.create_task(self._shutdown(loop))
+        loop.create_task(self._shutdown(loop), name="executor shutdown")
 
     def _task_logger(self, task: Task) -> Any:
         return self._logger.bind(
indexify/executor/function_executor/function_executor.py
CHANGED
@@ -88,6 +88,7 @@ class FunctionExecutor:
         initialize_request: InitializeRequest,
         base_url: str,
         config_path: Optional[str],
+        customer_code_timeout_sec: Optional[float] = None,
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
@@ -103,7 +104,9 @@ class FunctionExecutor:
         await self._establish_channel()
         stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
         await _collect_server_info(stub)
-        await _initialize_server(
+        await _initialize_server(
+            stub, initialize_request, customer_code_timeout_sec
+        )
         await self._create_invocation_state_client(
             stub=stub,
             base_url=base_url,
@@ -293,18 +296,28 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
 
 
 async def _initialize_server(
-    stub: FunctionExecutorStub,
+    stub: FunctionExecutorStub,
+    initialize_request: InitializeRequest,
+    customer_code_timeout_sec: Optional[float],
 ) -> None:
     with (
         metric_initialize_rpc_errors.count_exceptions(),
         metric_initialize_rpc_latency.time(),
     ):
-
-
-
-
-
-
-
-
-
+        try:
+            initialize_response: InitializeResponse = await stub.initialize(
+                initialize_request,
+                timeout=customer_code_timeout_sec,
+            )
+            if initialize_response.success:
+                return
+            if initialize_response.HasField("customer_error"):
+                raise CustomerError(initialize_response.customer_error)
+            else:
+                raise Exception("initialize RPC failed at function executor server")
+        except grpc.aio.AioRpcError as e:
+            if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                raise CustomerError(
+                    f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
+                ) from e
+            raise
indexify/executor/function_executor/function_executor_state.py
CHANGED
@@ -25,6 +25,7 @@ class FunctionExecutorState:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
         logger: Any,
     ):
         # Read only fields.
@@ -33,6 +34,7 @@ class FunctionExecutorState:
         self.graph_name: str = graph_name
         self.function_name: str = function_name
         self.image_uri: Optional[str] = image_uri
+        self.secret_names: List[str] = secret_names
         self._logger: Any = logger.bind(
             module=__name__,
             function_executor_id=id,
@@ -47,6 +49,7 @@ class FunctionExecutorState:
         # TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
         self.graph_version: str = graph_version
         self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
+        self.status_message: str = ""
         self.status_change_notifier: asyncio.Condition = asyncio.Condition(
             lock=self.lock
         )
@@ -62,7 +65,9 @@ class FunctionExecutorState:
         while self.status not in allowlist:
             await self.status_change_notifier.wait()
 
-    async def set_status(
+    async def set_status(
+        self, new_status: FunctionExecutorStatus, status_message: str = ""
+    ) -> None:
         """Sets the status of the Function Executor.
 
         The caller must hold the lock.
@@ -70,6 +75,7 @@ class FunctionExecutorState:
         """
         self.check_locked()
         if is_status_change_allowed(self.status, new_status):
+            # If status didn't change then still log it for visibility.
             self._logger.info(
                 "function executor status changed",
                 old_status=self.status.name,
@@ -78,12 +84,14 @@ class FunctionExecutorState:
             metric_function_executors_with_status.labels(status=self.status.name).dec()
             metric_function_executors_with_status.labels(status=new_status.name).inc()
             self.status = new_status
+            self.status_message = status_message
             self.status_change_notifier.notify_all()
         else:
             raise ValueError(
                 f"Invalid status change from {self.status} to {new_status}"
             )
 
+    # TODO: Delete this method once HTTP protocol is removed as it's used only there.
     async def destroy_function_executor(self) -> None:
         """Destroys the Function Executor if it exists.
 
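set_status and wait_status cooperate through an asyncio.Condition built on the state's own lock: every status change notifies all waiters, and each waiter re-checks its allowlist. A standalone sketch of that wait/notify pattern:

import asyncio

class StatusHolder:
    def __init__(self) -> None:
        self.lock = asyncio.Lock()
        self.status = "DESTROYED"
        # The condition shares the state lock, mirroring FunctionExecutorState.
        self.changed = asyncio.Condition(lock=self.lock)

    async def wait_status(self, allowlist: list) -> None:
        async with self.lock:
            while self.status not in allowlist:
                await self.changed.wait()  # releases the lock while waiting

    async def set_status(self, new_status: str) -> None:
        async with self.lock:
            self.status = new_status
            self.changed.notify_all()  # wakes every waiter to re-check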
indexify/executor/function_executor/function_executor_states_container.py
CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, AsyncGenerator, Dict, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from .function_executor_state import FunctionExecutorState
 from .function_executor_status import FunctionExecutorStatus
@@ -26,6 +26,7 @@ class FunctionExecutorStatesContainer:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
     ) -> FunctionExecutorState:
         """Get or create a function executor state with the given ID.
 
@@ -45,6 +46,7 @@ class FunctionExecutorStatesContainer:
             graph_version=graph_version,
             function_name=function_name,
             image_uri=image_uri,
+            secret_names=secret_names,
             logger=self._logger,
         )
         self._states[id] = state
@@ -52,6 +54,11 @@ class FunctionExecutorStatesContainer:
 
         return self._states[id]
 
+    async def get(self, id: str) -> FunctionExecutorState:
+        """Get the state with the given ID. Raises Exception if the state does not exist."""
+        async with self._lock:
+            return self._states[id]
+
     async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
         async with self._lock:
             for state in self._states.values():
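Hypothetical usage of the new accessor; a missing ID surfaces as the dict's KeyError, which satisfies the docstring's "Raises Exception":

# "fe-1" is an illustrative Function Executor ID; container is a
# FunctionExecutorStatesContainer instance.
try:
    state = await container.get("fe-1")
except KeyError:
    ...  # the ID was never created or was already removed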
indexify/executor/function_executor/function_executor_status.py
CHANGED
@@ -23,6 +23,7 @@ class FunctionExecutorStatus(Enum):
     UNHEALTHY = "Unhealthy"
     # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
     # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
+    # RUNNING_TASK -> DESTROYING
     # UNHEALTHY -> DESTROYING
     # IDLE -> DESTROYING
     DESTROYING = "Destroying"
@@ -33,6 +34,8 @@ class FunctionExecutorStatus(Enum):
     SHUTDOWN = "Shutdown"  # Permanent stop state
 
 
+# TODO: After removing HTTP code simplify state transitions by not allowing to
+# startup an FE after it was destroyed. grpc protocol treats FEs as ephimeral and never revives them.
 def is_status_change_allowed(
     current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
 ) -> bool:
@@ -69,6 +72,7 @@ def is_status_change_allowed(
         ],
         FunctionExecutorStatus.RUNNING_TASK: [
             FunctionExecutorStatus.RUNNING_TASK,
+            FunctionExecutorStatus.DESTROYING,
             FunctionExecutorStatus.IDLE,
             FunctionExecutorStatus.UNHEALTHY,
             FunctionExecutorStatus.SHUTDOWN,
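The transition check is table-driven: each status maps to the statuses it may move to, and RUNNING_TASK -> DESTROYING is the new edge that lets a Function Executor be torn down while a task is still in flight. A condensed sketch of the mechanism (only a few statuses shown, not the full indexify table):

from enum import Enum

class FES(Enum):
    IDLE = "Idle"
    RUNNING_TASK = "RunningTask"
    DESTROYING = "Destroying"

ALLOWED_TRANSITIONS = {
    FES.IDLE: [FES.RUNNING_TASK, FES.DESTROYING],
    # DESTROYING is the edge added in 0.3.20:
    FES.RUNNING_TASK: [FES.RUNNING_TASK, FES.DESTROYING, FES.IDLE],
    FES.DESTROYING: [],
}

def is_status_change_allowed(current: FES, new: FES) -> bool:
    return new in ALLOWED_TRANSITIONS.get(current, [])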
indexify/executor/function_executor/health_checker.py
CHANGED
@@ -107,7 +107,9 @@ class HealthChecker:
             return
 
         self._health_check_failed_callback = callback
-        self._health_check_loop_task = asyncio.create_task(
+        self._health_check_loop_task = asyncio.create_task(
+            self._health_check_loop(), name="function executor health checker loop"
+        )
 
     def stop(self) -> None:
         """Stops the periodic health checks.
@@ -126,7 +128,10 @@ class HealthChecker:
                 break
             await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
 
-        asyncio.create_task(
+        asyncio.create_task(
+            self._health_check_failed_callback(result),
+            name="function executor health check failure callback",
+        )
         self._health_check_loop_task = None
 
 
indexify/executor/function_executor/invocation_state_client.py
CHANGED
@@ -67,7 +67,8 @@ class InvocationStateClient:
             self._response_generator()
         )
         self._request_loop_task = asyncio.create_task(
-            self._request_loop(server_requests)
+            self._request_loop(server_requests),
+            name="graph invocation state client request processing loop",
         )
 
     def add_task_to_invocation_id_entry(self, task_id: str, invocation_id: str) -> None:
@@ -100,7 +101,8 @@ class InvocationStateClient:
             pass
         except asyncio.CancelledError:
             # This async task was cancelled by destroy(). Normal situation too.
-
+            # This exception should not be suppressed, see Python asyncio docs.
+            raise
         except Exception as e:
             metric_request_read_errors.inc()
             self._logger.error(
indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
CHANGED
@@ -25,6 +25,12 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
         logger = logger.bind(module=__name__)
         port: Optional[int] = None
 
+        if len(config.secret_names) > 0:
+            logger.warning(
+                "Subprocess Function Executor does not support secrets. Please supply secrets as environment variables.",
+                secret_names=config.secret_names,
+            )
+
         try:
             port = self._allocate_port()
             args = [
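Since the subprocess flavor cannot inject managed secrets, the warning points users at environment variables. A generic sketch of handing a secret to a child process through its environment (the variable name is illustrative):

import os
import subprocess

env = dict(os.environ)
env["MY_API_KEY"] = "..."  # illustrative; populate from a secure source

# The child reads the secret from its environment rather than relying on
# the executor's secret-injection mechanism.
subprocess.run(
    ["python", "-c", "import os; print('MY_API_KEY' in os.environ)"],
    env=env,
)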
indexify/executor/function_executor/single_task_runner.py
CHANGED
@@ -10,6 +10,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
+from tensorlake.function_executor.proto.message_validator import MessageValidator
 
 from ..api_objects import Task
 from .function_executor import CustomerError, FunctionExecutor
@@ -26,7 +27,7 @@ from .server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
 from .task_input import TaskInput
-from .task_output import TaskOutput
+from .task_output import TaskMetrics, TaskOutput
 
 
 class SingleTaskRunner:
@@ -286,16 +287,17 @@ class _RunningTaskContextManager:
 
 
 def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
-
-
-
-
-
-
-
-
-    if
-
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
 
     output = TaskOutput(
         task_id=task.id,
@@ -308,10 +310,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
         stderr=response.stderr,
         reducer=response.is_reducer,
         success=response.success,
+        metrics=metrics,
     )
 
     if response.HasField("function_output"):
         output.function_output = response.function_output
+        output.output_encoding = response.function_output.output_encoding
     if response.HasField("router_output"):
         output.router_output = response.router_output
 
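The rewritten _task_output validates required response fields before reading them, then copies the protobuf map fields into plain dicts. A hedged sketch of the same validate-then-copy flow; MessageValidator.required_field raising on a missing field is inferred from its use here, not from documented API:

from tensorlake.function_executor.proto.message_validator import MessageValidator

def extract_counters(response) -> dict:
    validator = MessageValidator(response)
    validator.required_field("success")  # raises if the field is absent (inferred)
    if response.HasField("metrics"):
        return dict(response.metrics.counters)  # protobuf map -> plain dict
    return {}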
indexify/executor/function_executor/task_output.py
CHANGED
@@ -1,11 +1,17 @@
-from typing import Optional
+from typing import Dict, Optional
 
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     FunctionOutput,
     RouterOutput,
 )
 
-
+
+class TaskMetrics:
+    """Metrics for a task."""
+
+    def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
+        self.counters = counters
+        self.timers = timers
 
 
 class TaskOutput:
@@ -19,6 +25,7 @@ class TaskOutput:
         function_name: str,
         graph_version: str,
         graph_invocation_id: str,
+        output_encoding: Optional[str] = None,
         function_output: Optional[FunctionOutput] = None,
         router_output: Optional[RouterOutput] = None,
         stdout: Optional[str] = None,
@@ -26,6 +33,7 @@ class TaskOutput:
         reducer: bool = False,
         success: bool = False,
         is_internal_error: bool = False,
+        metrics: Optional[TaskMetrics] = None,
     ):
         self.task_id = task_id
         self.namespace = namespace
@@ -40,6 +48,8 @@ class TaskOutput:
         self.reducer = reducer
         self.success = success
         self.is_internal_error = is_internal_error
+        self.metrics = metrics
+        self.output_encoding = output_encoding
 
     @classmethod
     def internal_error(
@@ -63,3 +73,27 @@ class TaskOutput:
             stderr="Platform failed to execute the function.",
             is_internal_error=True,
         )
+
+    @classmethod
+    def function_timeout(
+        cls,
+        task_id: str,
+        namespace: str,
+        graph_name: str,
+        function_name: str,
+        graph_version: str,
+        graph_invocation_id: str,
+        timeout_sec: float,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for an function timeout error."""
+        # Task stdout, stderr is not available.
+        return TaskOutput(
+            task_id=task_id,
+            namespace=namespace,
+            graph_name=graph_name,
+            function_name=function_name,
+            graph_version=graph_version,
+            graph_invocation_id=graph_invocation_id,
+            stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
+            is_internal_error=False,
+        )
indexify/executor/grpc/channel_manager.py
CHANGED
@@ -69,6 +69,10 @@ class ChannelManager:
             certificate_chain=certificate_chain,
         )
 
+    async def destroy(self):
+        if self._channel is not None:
+            await self._destroy_locked_channel()
+
     async def get_channel(self) -> grpc.aio.Channel:
         """Returns a channel to the gRPC server.
 
@@ -155,6 +159,3 @@ class ChannelManager:
         except Exception as e:
             self._logger.error("failed closing channel", exc_info=e)
         self._channel = None
-
-    async def shutdown(self):
-        pass
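Together with the executor shutdown change above, destroy() replaces the old no-op shutdown(), and the channel is now torn down only after everything that uses it has stopped. A generic sketch of that ordering (not the indexify implementation):

# Shut down all consumers of the gRPC channel first, then close the channel,
# so no component observes a closed channel mid-operation.
async def shutdown(state_reporter, state_reconciler, channel_manager) -> None:
    if state_reporter is not None:
        await state_reporter.shutdown()
    if state_reconciler is not None:
        await state_reconciler.shutdown()
    if channel_manager is not None:
        await channel_manager.destroy()  # closes the underlying grpc.aio channel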
|