indexify 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +20 -91
- indexify/executor/api_objects.py +2 -0
- indexify/executor/executor.py +75 -84
- indexify/executor/function_executor/function_executor.py +5 -2
- indexify/executor/function_executor/function_executor_state.py +43 -43
- indexify/executor/function_executor/function_executor_states_container.py +10 -4
- indexify/executor/function_executor/function_executor_status.py +91 -0
- indexify/executor/function_executor/health_checker.py +37 -13
- indexify/executor/function_executor/metrics/function_executor.py +1 -1
- indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
- indexify/executor/function_executor/single_task_runner.py +100 -37
- indexify/executor/grpc/channel_creator.py +53 -0
- indexify/executor/grpc/metrics/channel_creator.py +18 -0
- indexify/executor/grpc/metrics/state_reporter.py +17 -0
- indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
- indexify/executor/grpc/state_reporter.py +199 -0
- indexify/executor/metrics/task_runner.py +7 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
- indexify/executor/task_runner.py +34 -6
- indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
- indexify/proto/task_scheduler_pb2.py +64 -0
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/METADATA +1 -1
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/RECORD +28 -24
- indexify/executor/state_reporter.py +0 -127
- indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/WHEEL +0 -0
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
|
4
|
+
class FunctionExecutorStatus(Enum):
    """Lifecycle status of a Function Executor.

    The comment above each member lists the statuses from which it may be
    entered; the authoritative transition table lives in
    is_status_change_allowed().
    """

    # Entered from: DESTROYED.
    STARTING_UP = "Starting Up"
    # Entered from: STARTING_UP.
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
    # Entered from: STARTING_UP.
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
    # Entered from: STARTING_UP, RUNNING_TASK.
    IDLE = "Idle"
    # Entered from: IDLE.
    RUNNING_TASK = "Running Task"
    # Entered from: IDLE, RUNNING_TASK.
    UNHEALTHY = "Unhealthy"
    # Entered from: STARTUP_FAILED_CUSTOMER_ERROR, STARTUP_FAILED_PLATFORM_ERROR,
    # UNHEALTHY, IDLE.
    DESTROYING = "Destroying"
    # Initial status; also entered from DESTROYING.
    DESTROYED = "Destroyed"
    # Terminal status, reachable from any other status; no transitions leave it.
    SHUTDOWN = "Shutdown"
|
34
|
+
|
35
|
+
|
36
|
+
# Transition table: maps each status to the set of statuses it may move to.
# Every status allows a self-transition, and SHUTDOWN is reachable from any
# status but is terminal. Built once at import time as frozensets instead of
# rebuilding a dict of lists on every is_status_change_allowed() call.
_ALLOWED_STATUS_TRANSITIONS = {
    FunctionExecutorStatus.DESTROYED: frozenset(
        {
            FunctionExecutorStatus.DESTROYED,
            FunctionExecutorStatus.STARTING_UP,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTING_UP: frozenset(
        {
            FunctionExecutorStatus.STARTING_UP,
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: frozenset(
        {
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: frozenset(
        {
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.IDLE: frozenset(
        {
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.RUNNING_TASK: frozenset(
        {
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.UNHEALTHY: frozenset(
        {
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.DESTROYING: frozenset(
        {
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.DESTROYED,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    # No transitions allowed from SHUTDOWN (other than staying in it).
    FunctionExecutorStatus.SHUTDOWN: frozenset({FunctionExecutorStatus.SHUTDOWN}),
}


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition from current_status to new_status is allowed.

    The allowed transitions are defined by _ALLOWED_STATUS_TRANSITIONS; unknown
    statuses allow no transitions at all.
    """
    return new_status in _ALLOWED_STATUS_TRANSITIONS.get(current_status, frozenset())
|
@@ -1,8 +1,10 @@
|
|
1
1
|
import asyncio
|
2
|
+
import os
|
2
3
|
from collections.abc import Awaitable, Callable
|
3
4
|
from typing import Any, Optional
|
4
5
|
|
5
|
-
|
6
|
+
import grpc
|
7
|
+
import grpc.aio
|
6
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
9
|
HealthCheckRequest,
|
8
10
|
HealthCheckResponse,
|
@@ -27,7 +29,10 @@ class HealthCheckResult:
|
|
27
29
|
|
28
30
|
|
29
31
|
class HealthChecker:
|
30
|
-
def __init__(
|
32
|
+
def __init__(
|
33
|
+
self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
|
34
|
+
):
|
35
|
+
self._channel: grpc.aio.Channel = channel
|
31
36
|
self._stub: FunctionExecutorStub = stub
|
32
37
|
self._logger: Any = logger.bind(module=__name__)
|
33
38
|
self._health_check_loop_task: Optional[asyncio.Task] = None
|
@@ -39,6 +44,12 @@ class HealthChecker:
|
|
39
44
|
"""Runs the health check once and returns the result.
|
40
45
|
|
41
46
|
Does not raise any exceptions."""
|
47
|
+
if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
|
48
|
+
return HealthCheckResult(
|
49
|
+
is_healthy=True,
|
50
|
+
reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
|
51
|
+
)
|
52
|
+
|
42
53
|
with metric_health_check_latency.time():
|
43
54
|
try:
|
44
55
|
response: HealthCheckResponse = await self._stub.check_health(
|
@@ -49,19 +60,32 @@ class HealthChecker:
|
|
49
60
|
return HealthCheckResult(
|
50
61
|
is_healthy=response.healthy, reason=response.status_message
|
51
62
|
)
|
52
|
-
except AioRpcError as e:
|
53
|
-
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
63
|
+
except grpc.aio.AioRpcError as e:
|
64
|
+
# Due to the customer code running in Function Executor we can't reliably conclude
|
65
|
+
# that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
|
66
|
+
# hold Python GIL and prevent the health check RPC from being processed by FE Python code.
|
67
|
+
#
|
68
|
+
# The only unhealthy condition we can be sure about is when the channel can't re-establish
|
69
|
+
# the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
|
70
|
+
# code is not involved when TCP connections are established to FE. Problems reestablishing
|
71
|
+
# the TCP connection are usually due to the FE process crashing and its gRPC server socket
|
72
|
+
# not being available anymore or due to prolonged local networking failures on Executor.
|
73
|
+
channel_connectivity = self._channel.get_state()
|
74
|
+
if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
|
75
|
+
return HealthCheckResult(
|
76
|
+
is_healthy=False,
|
77
|
+
reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
|
78
|
+
)
|
79
|
+
else:
|
80
|
+
return HealthCheckResult(
|
81
|
+
is_healthy=True,
|
82
|
+
reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
|
83
|
+
)
|
59
84
|
except Exception as e:
|
60
|
-
|
61
|
-
self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
|
85
|
+
self._logger.error("Got unexpected exception, ignoring", exc_info=e)
|
62
86
|
return HealthCheckResult(
|
63
|
-
is_healthy=
|
64
|
-
reason=f"Unexpected exception in Executor: {str(e)}",
|
87
|
+
is_healthy=True,
|
88
|
+
reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
|
65
89
|
)
|
66
90
|
|
67
91
|
def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
|
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
|
|
90
90
|
)
|
91
91
|
# Counts Function Executor creations, labeled with the executor version and
# the SDK version/language info of the function being executed.
metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
    "function_executor_infos",
    "Number of Function Executor creations with particular info",
    ["version", "sdk_version", "sdk_language", "sdk_language_version"],
)
|
96
96
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
import prometheus_client
|
2
2
|
|
3
|
+
from ..function_executor_status import FunctionExecutorStatus
|
4
|
+
|
3
5
|
# This file contains all metrics used by FunctionExecutorState.
|
4
6
|
|
5
7
|
metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
|
8
10
|
"Number of times a Function Executor state was used without acquiring its lock",
|
9
11
|
)
|
10
12
|
)
|
13
|
+
|
14
|
+
# Function Executors count with a particular status.
metric_function_executors_with_status: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "function_executors_with_status",
        "Number of Function Executors with a particular status",
        ["status"],
    )
)
# Pre-create a labeled child for every status so the gauge exports a 0 value
# for statuses that never occur. Iterating the enum replaces nine hand-written
# .labels() calls and keeps this list in sync if statuses are added or removed.
for _status in FunctionExecutorStatus:
    metric_function_executors_with_status.labels(status=_status.name)
del _status
|
@@ -1,8 +1,10 @@
|
|
1
|
-
from
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from typing import Any, List, Optional
|
2
3
|
|
3
4
|
from .function_executor_server import FunctionExecutorServer
|
4
5
|
|
5
6
|
|
7
|
+
@dataclass
|
6
8
|
class FunctionExecutorServerConfiguration:
|
7
9
|
"""Configuration for creating a FunctionExecutorServer.
|
8
10
|
|
@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
16
|
configuration parameters or raise an exception if it can't implement
|
15
17
|
them."""
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Container image URI of the Function Executor Server.
|
23
|
-
self.image_uri: Optional[str] = image_uri
|
19
|
+
executor_id: str
|
20
|
+
function_executor_id: str
|
21
|
+
namespace: str
|
22
|
+
image_uri: Optional[str]
|
23
|
+
secret_names: List[str]
|
24
24
|
|
25
25
|
|
26
26
|
class FunctionExecutorServerFactory:
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .function_executor_status import FunctionExecutorStatus
|
17
18
|
from .health_checker import HealthChecker, HealthCheckResult
|
18
19
|
from .metrics.single_task_runner import (
|
19
20
|
metric_function_executor_run_task_rpc_errors,
|
@@ -40,9 +41,11 @@ class SingleTaskRunner:
|
|
40
41
|
logger: Any,
|
41
42
|
):
|
42
43
|
self._executor_id: str = executor_id
|
43
|
-
self.
|
44
|
+
self._function_executor_state: FunctionExecutorState = function_executor_state
|
44
45
|
self._task_input: TaskInput = task_input
|
45
|
-
self.
|
46
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
47
|
+
function_executor_server_factory
|
48
|
+
)
|
46
49
|
self._base_url: str = base_url
|
47
50
|
self._config_path: Optional[str] = config_path
|
48
51
|
self._logger = logger.bind(module=__name__)
|
@@ -54,18 +57,32 @@ class SingleTaskRunner:
|
|
54
57
|
The lock is released during actual task run in the server.
|
55
58
|
The lock is relocked on return.
|
56
59
|
|
57
|
-
Raises an exception if an error occured.
|
58
|
-
|
60
|
+
Raises an exception if an error occured.
|
61
|
+
|
62
|
+
On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
63
|
+
On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
64
|
+
"""
|
65
|
+
self._function_executor_state.check_locked()
|
59
66
|
|
60
|
-
if self.
|
61
|
-
|
67
|
+
if self._function_executor_state.status not in [
|
68
|
+
FunctionExecutorStatus.IDLE,
|
69
|
+
FunctionExecutorStatus.UNHEALTHY,
|
70
|
+
FunctionExecutorStatus.DESTROYED,
|
71
|
+
]:
|
72
|
+
self._logger.error(
|
73
|
+
"Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
|
74
|
+
status=self._function_executor_state.status,
|
75
|
+
)
|
76
|
+
raise RuntimeError(
|
77
|
+
f"Unexpected Function Executor state {self._function_executor_state.status}"
|
78
|
+
)
|
62
79
|
|
63
80
|
# If Function Executor became unhealthy while was idle then destroy it.
|
64
81
|
# It'll be recreated below.
|
65
82
|
await self._destroy_existing_function_executor_if_unhealthy()
|
66
83
|
|
67
84
|
# Create Function Executor if it doesn't exist yet.
|
68
|
-
if self.
|
85
|
+
if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
|
69
86
|
try:
|
70
87
|
await self._create_function_executor()
|
71
88
|
except CustomerError as e:
|
@@ -87,15 +104,38 @@ class SingleTaskRunner:
|
|
87
104
|
# The periodic health checker might not notice this as it does only periodic checks.
|
88
105
|
await self._destroy_existing_function_executor_if_unhealthy()
|
89
106
|
|
90
|
-
|
91
|
-
|
92
|
-
|
107
|
+
if self._function_executor_state.status not in [
|
108
|
+
FunctionExecutorStatus.IDLE,
|
109
|
+
FunctionExecutorStatus.UNHEALTHY,
|
110
|
+
FunctionExecutorStatus.DESTROYED,
|
111
|
+
]:
|
112
|
+
self._logger.error(
|
113
|
+
"Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
|
114
|
+
status=self._function_executor_state.status,
|
115
|
+
)
|
116
|
+
if self._function_executor_state.function_executor is None:
|
117
|
+
await self._function_executor_state.set_status(
|
118
|
+
FunctionExecutorStatus.DESTROYED
|
119
|
+
)
|
120
|
+
else:
|
121
|
+
await self._function_executor_state.set_status(
|
122
|
+
FunctionExecutorStatus.UNHEALTHY
|
123
|
+
)
|
124
|
+
|
125
|
+
async def _create_function_executor(self) -> None:
|
126
|
+
await self._function_executor_state.set_status(
|
127
|
+
FunctionExecutorStatus.STARTING_UP
|
128
|
+
)
|
129
|
+
self._function_executor_state.function_executor = FunctionExecutor(
|
130
|
+
server_factory=self._function_executor_server_factory, logger=self._logger
|
93
131
|
)
|
94
132
|
config: FunctionExecutorServerConfiguration = (
|
95
133
|
FunctionExecutorServerConfiguration(
|
96
134
|
executor_id=self._executor_id,
|
97
|
-
function_executor_id=self.
|
135
|
+
function_executor_id=self._function_executor_state.id,
|
136
|
+
namespace=self._task_input.task.namespace,
|
98
137
|
image_uri=self._task_input.task.image_uri,
|
138
|
+
secret_names=self._task_input.task.secret_names or [],
|
99
139
|
)
|
100
140
|
)
|
101
141
|
initialize_request: InitializeRequest = InitializeRequest(
|
@@ -107,17 +147,29 @@ class SingleTaskRunner:
|
|
107
147
|
)
|
108
148
|
|
109
149
|
try:
|
110
|
-
await function_executor.initialize(
|
150
|
+
await self._function_executor_state.function_executor.initialize(
|
111
151
|
config=config,
|
112
152
|
initialize_request=initialize_request,
|
113
153
|
base_url=self._base_url,
|
114
154
|
config_path=self._config_path,
|
115
155
|
)
|
116
|
-
|
156
|
+
except CustomerError:
|
157
|
+
# We have to follow the valid state transition sequence.
|
158
|
+
await self._function_executor_state.set_status(
|
159
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
|
160
|
+
)
|
161
|
+
await self._function_executor_state.destroy_function_executor()
|
162
|
+
raise
|
117
163
|
except Exception:
|
118
|
-
|
164
|
+
# We have to follow the valid state transition sequence.
|
165
|
+
await self._function_executor_state.set_status(
|
166
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
|
167
|
+
)
|
168
|
+
await self._function_executor_state.destroy_function_executor()
|
119
169
|
raise
|
120
170
|
|
171
|
+
await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
|
172
|
+
|
121
173
|
async def _run(self) -> TaskOutput:
|
122
174
|
request: RunTaskRequest = RunTaskRequest(
|
123
175
|
namespace=self._task_input.task.namespace,
|
@@ -130,13 +182,15 @@ class SingleTaskRunner:
|
|
130
182
|
)
|
131
183
|
if self._task_input.init_value is not None:
|
132
184
|
request.function_init_value.CopyFrom(self._task_input.init_value)
|
133
|
-
channel: grpc.aio.Channel =
|
185
|
+
channel: grpc.aio.Channel = (
|
186
|
+
self._function_executor_state.function_executor.channel()
|
187
|
+
)
|
134
188
|
|
135
189
|
async with _RunningTaskContextManager(
|
136
190
|
invocation_id=self._task_input.task.invocation_id,
|
137
191
|
task_id=self._task_input.task.id,
|
138
192
|
health_check_failed_callback=self._health_check_failed_callback,
|
139
|
-
function_executor_state=self.
|
193
|
+
function_executor_state=self._function_executor_state,
|
140
194
|
):
|
141
195
|
with (
|
142
196
|
metric_function_executor_run_task_rpc_errors.count_exceptions(),
|
@@ -154,31 +208,40 @@ class SingleTaskRunner:
|
|
154
208
|
async def _health_check_failed_callback(self, result: HealthCheckResult):
    """Destroys the Function Executor after a failed periodic health check.

    Destroying the FE here ensures that a running task RPC stuck in an
    unhealthy Function Executor fails immediately instead of hanging.
    """
    async with self._function_executor_state.lock:
        state = self._function_executor_state
        if state.status != FunctionExecutorStatus.RUNNING_TASK:
            # The callback can get delivered after the task already finished
            # running; nothing to do in that case.
            return

        await state.set_status(FunctionExecutorStatus.UNHEALTHY)
        await self._destroy_function_executor_on_failed_health_check(result.reason)
|
162
223
|
|
163
224
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
164
|
-
self.
|
165
|
-
if self.
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
225
|
+
self._function_executor_state.check_locked()
|
226
|
+
if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
|
227
|
+
result: HealthCheckResult = (
|
228
|
+
await self._function_executor_state.function_executor.health_checker().check()
|
229
|
+
)
|
230
|
+
if not result.is_healthy:
|
231
|
+
await self._function_executor_state.set_status(
|
232
|
+
FunctionExecutorStatus.UNHEALTHY
|
233
|
+
)
|
234
|
+
|
235
|
+
if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
|
236
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
173
237
|
|
174
238
|
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
    """Logs the health check failure, then destroys the Function Executor.

    Must be called with the Function Executor state lock held.
    """
    state = self._function_executor_state
    state.check_locked()
    self._logger.error(
        "Function Executor health check failed, destroying Function Executor",
        health_check_fail_reason=reason,
    )
    await state.destroy_function_executor()
|
182
245
|
|
183
246
|
|
184
247
|
class _RunningTaskContextManager:
|
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
|
|
199
262
|
self._state: FunctionExecutorState = function_executor_state
|
200
263
|
|
201
264
|
async def __aenter__(self):
|
202
|
-
self._state.
|
265
|
+
await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
|
203
266
|
self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
204
267
|
task_id=self._task_id,
|
205
268
|
invocation_id=self._invocation_id,
|
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:
|
|
213
276
|
|
214
277
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
215
278
|
await self._state.lock.acquire()
|
216
|
-
|
217
|
-
|
218
|
-
|
279
|
+
# Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
|
280
|
+
if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
|
281
|
+
await self._state.set_status(FunctionExecutorStatus.IDLE)
|
219
282
|
self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
|
220
283
|
task_id=self._task_id
|
221
284
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
|
6
|
+
from .metrics.channel_creator import (
|
7
|
+
metric_grpc_server_channel_creation_latency,
|
8
|
+
metric_grpc_server_channel_creation_retries,
|
9
|
+
metric_grpc_server_channel_creations,
|
10
|
+
)
|
11
|
+
|
12
|
+
_RETRY_INTERVAL_SEC = 5
|
13
|
+
_CONNECT_TIMEOUT_SEC = 5
|
14
|
+
|
15
|
+
|
16
|
+
class ChannelCreator:
    """Creates ready gRPC channels to a fixed server address, retrying until a
    connection is established or shutdown() is called."""

    def __init__(self, server_address: str, logger: Any):
        self._logger = logger.bind(module=__name__)
        self._server_address = server_address
        self._is_shutdown = False

    async def create(self) -> grpc.aio.Channel:
        """Creates a channel to the gRPC server.

        Blocks until the channel is ready, retrying every _RETRY_INTERVAL_SEC
        seconds on failure. Never raises any exceptions.

        NOTE(review): if shutdown() is called while this is retrying, the loop
        exits and the coroutine implicitly returns None despite the return
        annotation — callers appear expected to tolerate that; confirm.
        """
        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while not self._is_shutdown:
                try:
                    new_channel = grpc.aio.insecure_channel(self._server_address)
                    await asyncio.wait_for(
                        new_channel.channel_ready(),
                        timeout=_CONNECT_TIMEOUT_SEC,
                    )
                    return new_channel
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    # Best-effort cleanup of the half-open channel; a failure
                    # here must not break the retry loop.
                    try:
                        await new_channel.close()
                    except Exception as close_error:
                        self._logger.error(
                            "failed closing not established channel",
                            exc_info=close_error,
                        )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)

    async def shutdown(self):
        """Makes in-progress and future create() calls stop retrying."""
        self._is_shutdown = True
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Total channel creation attempts (one increment per ChannelCreator.create()).
metric_grpc_server_channel_creations: prometheus_client.Counter = (
    prometheus_client.Counter(
        "grpc_server_channel_creations",
        "Number of times a channel to gRPC Server was created",
    )
)
# Retries performed while establishing a single channel.
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = (
    prometheus_client.Counter(
        "grpc_server_channel_creation_retries",
        "Number of retries during a channel creation to gRPC Server",
    )
)
# Wall-clock latency of establishing a channel, including retries.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "grpc_server_channel_creation",
        "gRPC server channel creation",
    )
)
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# State report RPCs attempted against the Server.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)
# State report RPCs that failed.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)
# Latency of a single state report RPC.
metric_state_report_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "state_report_rpc", "Executor state report rpc to Server"
    )
)
|