indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +158 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +69 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +68 -60
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +129 -108
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
- indexify-0.4.3.dist-info/RECORD +68 -0
- indexify-0.4.3.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
@@ -1,95 +0,0 @@
|
|
1
|
-
from enum import Enum
|
2
|
-
|
3
|
-
|
4
|
-
class FunctionExecutorStatus(Enum):
    """Lifecycle status of a Function Executor.

    The comment above each member names the statuses that are allowed to
    transition into it.
    """

    # Reached from: DESTROYED
    STARTING_UP = "Starting Up"

    # Reached from: STARTING_UP
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"

    # Reached from: STARTING_UP
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"

    # Reached from: STARTING_UP, RUNNING_TASK
    IDLE = "Idle"

    # Reached from: IDLE
    RUNNING_TASK = "Running Task"

    # Reached from: IDLE, RUNNING_TASK
    UNHEALTHY = "Unhealthy"

    # Reached from: STARTUP_FAILED_CUSTOMER_ERROR, STARTUP_FAILED_PLATFORM_ERROR,
    # RUNNING_TASK, UNHEALTHY, IDLE
    DESTROYING = "Destroying"

    # Initial status; also reached from: DESTROYING
    DESTROYED = "Destroyed"

    # Reached from any status. Permanent stop state.
    SHUTDOWN = "Shutdown"
|
35
|
-
|
36
|
-
|
37
|
-
# TODO: After removing HTTP code simplify state transitions by not allowing to
|
38
|
-
# startup an FE after it was destroyed. grpc protocol treats FEs as ephemeral and never revives them.
|
39
|
-
# Maps each current status to the statuses it may change to.
# Built once at import time instead of on every call. Every status may keep
# its current value and may move to SHUTDOWN; SHUTDOWN itself is terminal.
_ALLOWED_STATUS_TRANSITIONS = {
    FunctionExecutorStatus.DESTROYED: (
        FunctionExecutorStatus.DESTROYED,
        FunctionExecutorStatus.STARTING_UP,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.STARTING_UP: (
        FunctionExecutorStatus.STARTING_UP,
        FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
        FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
        FunctionExecutorStatus.IDLE,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: (
        FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: (
        FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.IDLE: (
        FunctionExecutorStatus.IDLE,
        FunctionExecutorStatus.RUNNING_TASK,
        FunctionExecutorStatus.UNHEALTHY,
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.RUNNING_TASK: (
        FunctionExecutorStatus.RUNNING_TASK,
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.IDLE,
        FunctionExecutorStatus.UNHEALTHY,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.UNHEALTHY: (
        FunctionExecutorStatus.UNHEALTHY,
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    FunctionExecutorStatus.DESTROYING: (
        FunctionExecutorStatus.DESTROYING,
        FunctionExecutorStatus.DESTROYED,
        FunctionExecutorStatus.SHUTDOWN,
    ),
    # No transitions allowed from SHUTDOWN.
    FunctionExecutorStatus.SHUTDOWN: (FunctionExecutorStatus.SHUTDOWN,),
}


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition is allowed.

    Args:
        current_status: Status the Function Executor has now.
        new_status: Status the caller wants to set.

    Returns:
        True when `new_status` is listed for `current_status` in the
        transition table, False otherwise (including when `current_status`
        has no entry in the table).
    """
    return new_status in _ALLOWED_STATUS_TRANSITIONS.get(current_status, ())
|
@@ -1,46 +0,0 @@
|
|
1
|
-
import prometheus_client
|
2
|
-
|
3
|
-
from ..function_executor_status import FunctionExecutorStatus
|
4
|
-
|
5
|
-
# This file contains all metrics used by FunctionExecutorState.
|
6
|
-
|
7
|
-
# Incremented each time a Function Executor state is used without holding its lock.
metric_function_executor_state_not_locked_errors: prometheus_client.Counter = prometheus_client.Counter(
    "function_executor_state_not_locked_errors",
    "Number of times a Function Executor state was used without acquiring its lock",
)
|
13
|
-
|
14
|
-
# Function Executors count with a particular status.
metric_function_executors_with_status: prometheus_client.Gauge = prometheus_client.Gauge(
    "function_executors_with_status",
    "Number of Function Executors with a particular status",
    ["status"],
)
# Create the labelled child for every status up front. Iterating the enum
# (definition order) keeps this list in sync with FunctionExecutorStatus, so a
# newly added status cannot be forgotten here.
for _status in FunctionExecutorStatus:
    metric_function_executors_with_status.labels(status=_status.name)
|
@@ -1,10 +0,0 @@
|
|
1
|
-
import prometheus_client
|
2
|
-
|
3
|
-
# This file contains all metrics used by FunctionExecutorStatesContainer.
|
4
|
-
|
5
|
-
# Gauge tracking how many Function Executor states currently exist.
metric_function_executor_states_count: prometheus_client.Gauge = prometheus_client.Gauge(
    "function_executor_states_count",
    "Number of existing Function Executor states",
)
|
@@ -1,345 +0,0 @@
|
|
1
|
-
from collections.abc import Awaitable, Callable
|
2
|
-
from math import ceil
|
3
|
-
from typing import Any, Optional
|
4
|
-
|
5
|
-
import grpc
|
6
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
|
-
InitializeRequest,
|
8
|
-
RunTaskRequest,
|
9
|
-
RunTaskResponse,
|
10
|
-
)
|
11
|
-
from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
12
|
-
FunctionExecutorStub,
|
13
|
-
)
|
14
|
-
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
15
|
-
|
16
|
-
from ..api_objects import Task
|
17
|
-
from .function_executor import CustomerError, FunctionExecutor
|
18
|
-
from .function_executor_state import FunctionExecutorState
|
19
|
-
from .function_executor_status import FunctionExecutorStatus
|
20
|
-
from .health_checker import HealthChecker, HealthCheckResult
|
21
|
-
from .metrics.single_task_runner import (
|
22
|
-
metric_function_executor_run_task_rpc_errors,
|
23
|
-
metric_function_executor_run_task_rpc_latency,
|
24
|
-
metric_function_executor_run_task_rpcs,
|
25
|
-
)
|
26
|
-
from .server.function_executor_server_factory import (
|
27
|
-
FunctionExecutorServerConfiguration,
|
28
|
-
FunctionExecutorServerFactory,
|
29
|
-
)
|
30
|
-
from .task_input import TaskInput
|
31
|
-
from .task_output import TaskMetrics, TaskOutput
|
32
|
-
|
33
|
-
|
34
|
-
class SingleTaskRunner:
    """Runs a single task on the Function Executor tracked by a FunctionExecutorState.

    The Function Executor is created on demand if the state is DESTROYED and is
    destroyed when a health check reports it as unhealthy.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_state: FunctionExecutorState,
        task_input: TaskInput,
        function_executor_server_factory: FunctionExecutorServerFactory,
        base_url: str,
        config_path: Optional[str],
        logger: Any,
    ):
        """Stores everything needed to create the Function Executor and run the task.

        Args:
            executor_id: Passed to the Function Executor server configuration.
            function_executor_state: State (status, lock, Function Executor) to operate on.
            task_input: The task together with its graph, input and optional init value.
            function_executor_server_factory: Factory used when a new Function Executor is created.
            base_url: Forwarded to FunctionExecutor.initialize().
            config_path: Forwarded to FunctionExecutor.initialize(); may be None.
            logger: Logger supporting `.bind(...)`; bound to this module's name.
        """
        self._executor_id: str = executor_id
        self._function_executor_state: FunctionExecutorState = function_executor_state
        self._task_input: TaskInput = task_input
        self._function_executor_server_factory: FunctionExecutorServerFactory = (
            function_executor_server_factory
        )
        self._base_url: str = base_url
        self._config_path: Optional[str] = config_path
        self._logger = logger.bind(module=__name__)

    async def run(self) -> TaskOutput:
        """Runs the task in the Function Executor.

        The FunctionExecutorState must be locked by the caller.
        The lock is released during actual task run in the server.
        The lock is relocked on return.

        Raises an exception if an error occurred.

        On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
        On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
        """
        self._function_executor_state.check_locked()

        if self._function_executor_state.status not in [
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYED,
        ]:
            self._logger.error(
                "Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
                status=self._function_executor_state.status,
            )
            raise RuntimeError(
                f"Unexpected Function Executor state {self._function_executor_state.status}"
            )

        # If Function Executor became unhealthy while it was idle then destroy it.
        # It'll be recreated below.
        await self._destroy_existing_function_executor_if_unhealthy()

        # Create Function Executor if it doesn't exist yet.
        if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
            try:
                await self._create_function_executor()
            except CustomerError as e:
                # Startup failed because of customer code: report it as a failed
                # task output carrying the error text instead of raising.
                return TaskOutput(
                    task_id=self._task_input.task.id,
                    namespace=self._task_input.task.namespace,
                    graph_name=self._task_input.task.compute_graph,
                    function_name=self._task_input.task.compute_fn,
                    graph_version=self._task_input.task.graph_version,
                    graph_invocation_id=self._task_input.task.invocation_id,
                    stderr=str(e),
                    success=False,
                    output_payload_uri_prefix=self._task_input.task.output_payload_uri_prefix,
                )

        try:
            return await self._run()
        finally:
            # If Function Executor became unhealthy while running the task then destroy it.
            # The periodic health checker might not notice this as it does only periodic checks.
            await self._destroy_existing_function_executor_if_unhealthy()

            if self._function_executor_state.status not in [
                FunctionExecutorStatus.IDLE,
                FunctionExecutorStatus.UNHEALTHY,
                FunctionExecutorStatus.DESTROYED,
            ]:
                self._logger.error(
                    "Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
                    status=self._function_executor_state.status,
                )
                if self._function_executor_state.function_executor is None:
                    await self._function_executor_state.set_status(
                        FunctionExecutorStatus.DESTROYED
                    )
                else:
                    await self._function_executor_state.set_status(
                        FunctionExecutorStatus.UNHEALTHY
                    )

    async def _create_function_executor(self) -> None:
        """Creates and initializes a Function Executor for the task's function.

        Sets the status to STARTING_UP first and to IDLE on success. On failure
        the status goes through the matching STARTUP_FAILED_* status, the
        Function Executor is destroyed and the exception is re-raised.
        """
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.STARTING_UP
        )
        self._function_executor_state.function_executor = FunctionExecutor(
            server_factory=self._function_executor_server_factory, logger=self._logger
        )
        task: Task = self._task_input.task
        config: FunctionExecutorServerConfiguration = (
            FunctionExecutorServerConfiguration(
                executor_id=self._executor_id,
                function_executor_id=self._function_executor_state.id,
                namespace=task.namespace,
                graph_name=task.compute_graph,
                graph_version=task.graph_version,
                function_name=task.compute_fn,
                image_uri=task.image_uri,
                secret_names=task.secret_names or [],
                # CPUs are converted to milliseconds of CPU time per second, rounded up.
                cpu_ms_per_sec=(
                    None
                    if task.resources.cpus is None
                    else ceil(task.resources.cpus * 1000)
                ),
                # Megabyte values are converted to bytes.
                memory_bytes=(
                    None
                    if task.resources.memory_mb is None
                    else task.resources.memory_mb * 1024 * 1024
                ),
                disk_bytes=(
                    None
                    if task.resources.ephemeral_disk_mb is None
                    else task.resources.ephemeral_disk_mb * 1024 * 1024
                ),
                gpu_count=0 if task.resources.gpu is None else task.resources.gpu.count,
            )
        )
        initialize_request: InitializeRequest = InitializeRequest(
            namespace=self._task_input.task.namespace,
            graph_name=self._task_input.task.compute_graph,
            graph_version=self._task_input.task.graph_version,
            function_name=self._task_input.task.compute_fn,
            graph=self._task_input.graph,
        )

        try:
            await self._function_executor_state.function_executor.initialize(
                config=config,
                initialize_request=initialize_request,
                base_url=self._base_url,
                config_path=self._config_path,
            )
        except CustomerError:
            # We have to follow the valid state transition sequence.
            await self._function_executor_state.set_status(
                FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
            )
            await self._function_executor_state.destroy_function_executor()
            raise
        except Exception:
            # We have to follow the valid state transition sequence.
            await self._function_executor_state.set_status(
                FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
            )
            await self._function_executor_state.destroy_function_executor()
            raise

        await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)

    async def _run(self) -> TaskOutput:
        """Sends the run_task RPC to the Function Executor and converts the response."""
        request: RunTaskRequest = RunTaskRequest(
            namespace=self._task_input.task.namespace,
            graph_name=self._task_input.task.compute_graph,
            graph_version=self._task_input.task.graph_version,
            function_name=self._task_input.task.compute_fn,
            graph_invocation_id=self._task_input.task.invocation_id,
            task_id=self._task_input.task.id,
            function_input=self._task_input.input,
        )
        if self._task_input.init_value is not None:
            request.function_init_value.CopyFrom(self._task_input.init_value)
        channel: grpc.aio.Channel = (
            self._function_executor_state.function_executor.channel()
        )

        async with _RunningTaskContextManager(
            invocation_id=self._task_input.task.invocation_id,
            task_id=self._task_input.task.id,
            health_check_failed_callback=self._health_check_failed_callback,
            function_executor_state=self._function_executor_state,
        ):
            with (
                metric_function_executor_run_task_rpc_errors.count_exceptions(),
                metric_function_executor_run_task_rpc_latency.time(),
            ):
                metric_function_executor_run_task_rpcs.inc()
                # If this RPC failed due to customer code crashing the server we won't be
                # able to detect this. We'll treat this as our own error for now and thus
                # let the AioRpcError to be raised here.
                response: RunTaskResponse = await FunctionExecutorStub(
                    channel
                ).run_task(request)
                return _task_output(task=self._task_input.task, response=response)

    async def _health_check_failed_callback(self, result: HealthCheckResult):
        """Marks the Function Executor UNHEALTHY and destroys it while a task is running."""
        # Function Executor destroy due to the periodic health check failure ensures that
        # a running task RPC stuck in unhealthy Function Executor fails immediately.
        async with self._function_executor_state.lock:
            if (
                self._function_executor_state.status
                != FunctionExecutorStatus.RUNNING_TASK
            ):
                # Protection in case the callback gets delivered after we finished running the task.
                return

            await self._function_executor_state.set_status(
                FunctionExecutorStatus.UNHEALTHY
            )
            await self._destroy_function_executor_on_failed_health_check(result.reason)

    async def _destroy_existing_function_executor_if_unhealthy(self):
        """Health-checks an IDLE Function Executor and destroys it if it is UNHEALTHY.

        The state must be locked by the caller.
        """
        self._function_executor_state.check_locked()
        # Reason used when the state was already UNHEALTHY on entry, in which
        # case no health check is run here and there is no result to read.
        # Without this default the destroy call below hit an unbound `result`.
        reason: str = "Function Executor was already marked as unhealthy"
        if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
            result: HealthCheckResult = (
                await self._function_executor_state.function_executor.health_checker().check()
            )
            if not result.is_healthy:
                await self._function_executor_state.set_status(
                    FunctionExecutorStatus.UNHEALTHY
                )
                reason = result.reason

        if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
            await self._destroy_function_executor_on_failed_health_check(reason)

    async def _destroy_function_executor_on_failed_health_check(self, reason: str):
        """Logs the failed health check and destroys the Function Executor.

        The state must be locked by the caller.
        """
        self._function_executor_state.check_locked()
        self._logger.error(
            "Function Executor health check failed, destroying Function Executor",
            health_check_fail_reason=reason,
        )
        await self._function_executor_state.destroy_function_executor()
|
268
|
-
|
269
|
-
|
270
|
-
class _RunningTaskContextManager:
    """Performs all the actions required before and after running a task.

    The state lock must be held when entering. It is released inside
    `__aenter__` and re-acquired in `__aexit__`.
    """

    def __init__(
        self,
        invocation_id: str,
        task_id: str,
        health_check_failed_callback: Callable[[HealthCheckResult], Awaitable[None]],
        function_executor_state: FunctionExecutorState,
    ):
        """Stores the task identity, the health check callback and the state.

        Args:
            invocation_id: Invocation the task belongs to.
            task_id: ID of the task being run.
            health_check_failed_callback: Coroutine function handed to the
                health checker; it receives the failed HealthCheckResult.
            function_executor_state: State whose status and lock are managed here.
        """
        self._invocation_id: str = invocation_id
        self._task_id: str = task_id
        self._health_check_failed_callback: Callable[
            [HealthCheckResult], Awaitable[None]
        ] = health_check_failed_callback
        self._state: FunctionExecutorState = function_executor_state

    async def __aenter__(self):
        await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
        self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
            task_id=self._task_id,
            invocation_id=self._invocation_id,
        )
        self._state.function_executor.health_checker().start(
            self._health_check_failed_callback
        )
        # Unlock the state so other tasks can act depending on it.
        self._state.lock.release()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._state.lock.acquire()
        # Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
        if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
            await self._state.set_status(FunctionExecutorStatus.IDLE)

        function_executor = self._state.function_executor
        if function_executor is None:
            # NOTE(review): presumably destroy_function_executor() resets
            # function_executor to None - confirm against FunctionExecutorState.
            # There is nothing left to clean up then, and skipping the cleanup
            # avoids an AttributeError replacing the task's own exception.
            return
        function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
            task_id=self._task_id
        )
        function_executor.health_checker().stop()
|
309
|
-
|
310
|
-
|
311
|
-
def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
    """Builds a TaskOutput from the task and the run_task RPC response.

    The response must carry stdout, stderr, is_reducer and success; these are
    checked with MessageValidator.required_field(). The metrics,
    function_output and router_output fields are copied only when present.
    """
    validator = MessageValidator(response)
    for required in ("stdout", "stderr", "is_reducer", "success"):
        validator.required_field(required)

    # Metrics may be absent, e.g. if the function failed.
    has_metrics = response.HasField("metrics")
    task_metrics = TaskMetrics(
        counters=dict(response.metrics.counters) if has_metrics else {},
        timers=dict(response.metrics.timers) if has_metrics else {},
    )

    result = TaskOutput(
        task_id=task.id,
        namespace=task.namespace,
        graph_name=task.compute_graph,
        function_name=task.compute_fn,
        graph_version=task.graph_version,
        graph_invocation_id=task.invocation_id,
        stdout=response.stdout,
        stderr=response.stderr,
        reducer=response.is_reducer,
        success=response.success,
        metrics=task_metrics,
        output_payload_uri_prefix=task.output_payload_uri_prefix,
    )

    if response.HasField("function_output"):
        function_output = response.function_output
        result.function_output = function_output
        result.output_encoding = function_output.output_encoding
    if response.HasField("router_output"):
        result.router_output = response.router_output

    return result
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from typing import Optional
|
2
|
-
|
3
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
4
|
-
|
5
|
-
from ..api_objects import Task
|
6
|
-
|
7
|
-
|
8
|
-
class TaskInput:
    """Bundles a task with the serialized objects needed to execute it."""

    def __init__(
        self,
        task: Task,
        graph: SerializedObject,
        input: SerializedObject,
        init_value: Optional[SerializedObject],
    ):
        """Stores the task, its graph, its input and the optional init value.

        Args:
            task: The task to run.
            graph: Serialized graph of the task.
            input: Serialized function input.
            init_value: Serialized init value, or None when there is none.
        """
        self.task: Task = task
        self.graph: SerializedObject = graph
        self.input: SerializedObject = input
        self.init_value: Optional[SerializedObject] = init_value
|
@@ -1,105 +0,0 @@
|
|
1
|
-
from typing import Dict, Optional
|
2
|
-
|
3
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
4
|
-
FunctionOutput,
|
5
|
-
RouterOutput,
|
6
|
-
)
|
7
|
-
|
8
|
-
|
9
|
-
class TaskMetrics:
    """Counters and timers collected for a task."""

    def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
        """Stores the given mappings as-is (no copy is made).

        Args:
            counters: Counter values keyed by name.
            timers: Timer values keyed by name.
        """
        self.counters, self.timers = counters, timers
|
15
|
-
|
16
|
-
|
17
|
-
class TaskOutput:
    """Result of running a task."""

    def __init__(
        self,
        task_id: str,
        namespace: str,
        graph_name: str,
        function_name: str,
        graph_version: str,
        graph_invocation_id: str,
        output_payload_uri_prefix: Optional[str],
        output_encoding: Optional[str] = None,
        function_output: Optional[FunctionOutput] = None,
        router_output: Optional[RouterOutput] = None,
        stdout: Optional[str] = None,
        stderr: Optional[str] = None,
        reducer: bool = False,
        success: bool = False,
        is_internal_error: bool = False,
        metrics: Optional[TaskMetrics] = None,
    ):
        """Stores every argument on the instance under the same name."""
        self.task_id = task_id
        self.namespace = namespace
        self.graph_name = graph_name
        self.function_name = function_name
        self.graph_version = graph_version
        self.graph_invocation_id = graph_invocation_id
        self.function_output = function_output
        self.router_output = router_output
        self.stdout = stdout
        self.stderr = stderr
        self.reducer = reducer
        self.success = success
        self.is_internal_error = is_internal_error
        self.metrics = metrics
        self.output_encoding = output_encoding
        self.output_payload_uri_prefix = output_payload_uri_prefix

    @classmethod
    def internal_error(
        cls,
        task_id: str,
        namespace: str,
        graph_name: str,
        function_name: str,
        graph_version: str,
        graph_invocation_id: str,
        output_payload_uri_prefix: Optional[str],
    ) -> "TaskOutput":
        """Creates a TaskOutput for an internal error.

        The result has `is_internal_error=True`, the default `success=False`
        and a generic stderr message.
        """
        # We are not sharing internal error messages with the customer.
        # `cls` (not the hard-coded class name) so subclasses get their own type.
        return cls(
            task_id=task_id,
            namespace=namespace,
            graph_name=graph_name,
            function_name=function_name,
            graph_version=graph_version,
            graph_invocation_id=graph_invocation_id,
            stderr="Platform failed to execute the function.",
            is_internal_error=True,
            output_payload_uri_prefix=output_payload_uri_prefix,
        )

    @classmethod
    def function_timeout(
        cls,
        task_id: str,
        namespace: str,
        graph_name: str,
        function_name: str,
        graph_version: str,
        graph_invocation_id: str,
        timeout_sec: float,
        output_payload_uri_prefix: Optional[str],
    ) -> "TaskOutput":
        """Creates a TaskOutput for a function timeout error.

        The result has `is_internal_error=False`, the default `success=False`
        and a stderr message that reports `timeout_sec` with millisecond precision.
        """
        # Task stdout, stderr is not available.
        return cls(
            task_id=task_id,
            namespace=namespace,
            graph_name=graph_name,
            function_name=function_name,
            graph_version=graph_version,
            graph_invocation_id=graph_invocation_id,
            stderr=f"Function or router exceeded its configured timeout of {timeout_sec:.3f} sec.",
            is_internal_error=False,
            output_payload_uri_prefix=output_payload_uri_prefix,
        )
|