indexify 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +20 -91
- indexify/executor/api_objects.py +2 -0
- indexify/executor/executor.py +77 -86
- indexify/executor/function_executor/function_executor_state.py +43 -43
- indexify/executor/function_executor/function_executor_states_container.py +10 -4
- indexify/executor/function_executor/function_executor_status.py +91 -0
- indexify/executor/function_executor/metrics/function_executor.py +1 -1
- indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
- indexify/executor/function_executor/single_task_runner.py +100 -37
- indexify/executor/grpc/channel_creator.py +53 -0
- indexify/executor/grpc/metrics/channel_creator.py +18 -0
- indexify/executor/grpc/metrics/state_reporter.py +17 -0
- indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
- indexify/executor/grpc/state_reporter.py +199 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
- indexify/executor/task_runner.py +30 -6
- indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
- indexify/proto/task_scheduler_pb2.py +64 -0
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/METADATA +1 -1
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/RECORD +25 -21
- indexify/executor/state_reporter.py +0 -127
- indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/WHEEL +0 -0
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
|
4
|
+
class FunctionExecutorStatus(Enum):
    """Lifecycle status of a Function Executor.

    The comment above each member lists the statuses it can be reached from;
    `is_status_change_allowed` encodes the same state machine.
    """

    # Reachable from: DESTROYED.
    STARTING_UP = "Starting Up"
    # Reachable from: STARTING_UP.
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
    # Reachable from: STARTING_UP.
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
    # Reachable from: STARTING_UP, RUNNING_TASK.
    IDLE = "Idle"
    # Reachable from: IDLE.
    RUNNING_TASK = "Running Task"
    # Reachable from: IDLE, RUNNING_TASK.
    UNHEALTHY = "Unhealthy"
    # Reachable from: STARTUP_FAILED_CUSTOMER_ERROR, STARTUP_FAILED_PLATFORM_ERROR,
    # UNHEALTHY, IDLE.
    DESTROYING = "Destroying"
    # Initial status; also reachable from DESTROYING.
    DESTROYED = "Destroyed"
    # Permanent stop state, reachable from any status.
    SHUTDOWN = "Shutdown"


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition is allowed."""
    # Every status may transition to itself (no-op), and the terminal
    # SHUTDOWN status is reachable from anywhere.
    if new_status is current_status or new_status is FunctionExecutorStatus.SHUTDOWN:
        return True

    # Remaining allowed transitions, keyed by the current status.
    successors = {
        FunctionExecutorStatus.DESTROYED: {FunctionExecutorStatus.STARTING_UP},
        FunctionExecutorStatus.STARTING_UP: {
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.IDLE,
        },
        FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: {
            FunctionExecutorStatus.DESTROYING
        },
        FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: {
            FunctionExecutorStatus.DESTROYING
        },
        FunctionExecutorStatus.IDLE: {
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
        },
        FunctionExecutorStatus.RUNNING_TASK: {
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.UNHEALTHY,
        },
        FunctionExecutorStatus.UNHEALTHY: {FunctionExecutorStatus.DESTROYING},
        FunctionExecutorStatus.DESTROYING: {FunctionExecutorStatus.DESTROYED},
        FunctionExecutorStatus.SHUTDOWN: set(),  # No transitions out of SHUTDOWN.
    }
    return new_status in successors.get(current_status, set())
|
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
|
|
90
90
|
)
|
91
91
|
metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
|
92
92
|
"function_executor_infos",
|
93
|
-
"Number of Function
|
93
|
+
"Number of Function Executor creations with particular info",
|
94
94
|
["version", "sdk_version", "sdk_language", "sdk_language_version"],
|
95
95
|
)
|
96
96
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
import prometheus_client
|
2
2
|
|
3
|
+
from ..function_executor_status import FunctionExecutorStatus
|
4
|
+
|
3
5
|
# This file contains all metrics used by FunctionExecutorState.
|
4
6
|
|
5
7
|
metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
|
8
10
|
"Number of times a Function Executor state was used without acquiring its lock",
|
9
11
|
)
|
10
12
|
)
|
13
|
+
|
14
|
+
# Function Executors count with a particular status.
|
15
|
+
metric_function_executors_with_status: prometheus_client.Gauge = (
|
16
|
+
prometheus_client.Gauge(
|
17
|
+
"function_executors_with_status",
|
18
|
+
"Number of Function Executors with a particular status",
|
19
|
+
["status"],
|
20
|
+
)
|
21
|
+
)
|
22
|
+
metric_function_executors_with_status.labels(
|
23
|
+
status=FunctionExecutorStatus.STARTING_UP.name
|
24
|
+
)
|
25
|
+
metric_function_executors_with_status.labels(
|
26
|
+
status=FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR.name
|
27
|
+
)
|
28
|
+
metric_function_executors_with_status.labels(
|
29
|
+
status=FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR.name
|
30
|
+
)
|
31
|
+
metric_function_executors_with_status.labels(status=FunctionExecutorStatus.IDLE.name)
|
32
|
+
metric_function_executors_with_status.labels(
|
33
|
+
status=FunctionExecutorStatus.RUNNING_TASK.name
|
34
|
+
)
|
35
|
+
metric_function_executors_with_status.labels(
|
36
|
+
status=FunctionExecutorStatus.UNHEALTHY.name
|
37
|
+
)
|
38
|
+
metric_function_executors_with_status.labels(
|
39
|
+
status=FunctionExecutorStatus.DESTROYING.name
|
40
|
+
)
|
41
|
+
metric_function_executors_with_status.labels(
|
42
|
+
status=FunctionExecutorStatus.DESTROYED.name
|
43
|
+
)
|
44
|
+
metric_function_executors_with_status.labels(
|
45
|
+
status=FunctionExecutorStatus.SHUTDOWN.name
|
46
|
+
)
|
@@ -1,8 +1,10 @@
|
|
1
|
-
from
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from typing import Any, List, Optional
|
2
3
|
|
3
4
|
from .function_executor_server import FunctionExecutorServer
|
4
5
|
|
5
6
|
|
7
|
+
@dataclass
|
6
8
|
class FunctionExecutorServerConfiguration:
|
7
9
|
"""Configuration for creating a FunctionExecutorServer.
|
8
10
|
|
@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
16
|
configuration parameters or raise an exception if it can't implement
|
15
17
|
them."""
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Container image URI of the Function Executor Server.
|
23
|
-
self.image_uri: Optional[str] = image_uri
|
19
|
+
executor_id: str
|
20
|
+
function_executor_id: str
|
21
|
+
namespace: str
|
22
|
+
image_uri: Optional[str]
|
23
|
+
secret_names: List[str]
|
24
24
|
|
25
25
|
|
26
26
|
class FunctionExecutorServerFactory:
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .function_executor_status import FunctionExecutorStatus
|
17
18
|
from .health_checker import HealthChecker, HealthCheckResult
|
18
19
|
from .metrics.single_task_runner import (
|
19
20
|
metric_function_executor_run_task_rpc_errors,
|
@@ -40,9 +41,11 @@ class SingleTaskRunner:
|
|
40
41
|
logger: Any,
|
41
42
|
):
|
42
43
|
self._executor_id: str = executor_id
|
43
|
-
self.
|
44
|
+
self._function_executor_state: FunctionExecutorState = function_executor_state
|
44
45
|
self._task_input: TaskInput = task_input
|
45
|
-
self.
|
46
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
47
|
+
function_executor_server_factory
|
48
|
+
)
|
46
49
|
self._base_url: str = base_url
|
47
50
|
self._config_path: Optional[str] = config_path
|
48
51
|
self._logger = logger.bind(module=__name__)
|
@@ -54,18 +57,32 @@ class SingleTaskRunner:
|
|
54
57
|
The lock is released during actual task run in the server.
|
55
58
|
The lock is relocked on return.
|
56
59
|
|
57
|
-
Raises an exception if an error occured.
|
58
|
-
|
60
|
+
Raises an exception if an error occured.
|
61
|
+
|
62
|
+
On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
63
|
+
On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
64
|
+
"""
|
65
|
+
self._function_executor_state.check_locked()
|
59
66
|
|
60
|
-
if self.
|
61
|
-
|
67
|
+
if self._function_executor_state.status not in [
|
68
|
+
FunctionExecutorStatus.IDLE,
|
69
|
+
FunctionExecutorStatus.UNHEALTHY,
|
70
|
+
FunctionExecutorStatus.DESTROYED,
|
71
|
+
]:
|
72
|
+
self._logger.error(
|
73
|
+
"Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
|
74
|
+
status=self._function_executor_state.status,
|
75
|
+
)
|
76
|
+
raise RuntimeError(
|
77
|
+
f"Unexpected Function Executor state {self._function_executor_state.status}"
|
78
|
+
)
|
62
79
|
|
63
80
|
# If Function Executor became unhealthy while was idle then destroy it.
|
64
81
|
# It'll be recreated below.
|
65
82
|
await self._destroy_existing_function_executor_if_unhealthy()
|
66
83
|
|
67
84
|
# Create Function Executor if it doesn't exist yet.
|
68
|
-
if self.
|
85
|
+
if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
|
69
86
|
try:
|
70
87
|
await self._create_function_executor()
|
71
88
|
except CustomerError as e:
|
@@ -87,15 +104,38 @@ class SingleTaskRunner:
|
|
87
104
|
# The periodic health checker might not notice this as it does only periodic checks.
|
88
105
|
await self._destroy_existing_function_executor_if_unhealthy()
|
89
106
|
|
90
|
-
|
91
|
-
|
92
|
-
|
107
|
+
if self._function_executor_state.status not in [
|
108
|
+
FunctionExecutorStatus.IDLE,
|
109
|
+
FunctionExecutorStatus.UNHEALTHY,
|
110
|
+
FunctionExecutorStatus.DESTROYED,
|
111
|
+
]:
|
112
|
+
self._logger.error(
|
113
|
+
"Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
|
114
|
+
status=self._function_executor_state.status,
|
115
|
+
)
|
116
|
+
if self._function_executor_state.function_executor is None:
|
117
|
+
await self._function_executor_state.set_status(
|
118
|
+
FunctionExecutorStatus.DESTROYED
|
119
|
+
)
|
120
|
+
else:
|
121
|
+
await self._function_executor_state.set_status(
|
122
|
+
FunctionExecutorStatus.UNHEALTHY
|
123
|
+
)
|
124
|
+
|
125
|
+
async def _create_function_executor(self) -> None:
|
126
|
+
await self._function_executor_state.set_status(
|
127
|
+
FunctionExecutorStatus.STARTING_UP
|
128
|
+
)
|
129
|
+
self._function_executor_state.function_executor = FunctionExecutor(
|
130
|
+
server_factory=self._function_executor_server_factory, logger=self._logger
|
93
131
|
)
|
94
132
|
config: FunctionExecutorServerConfiguration = (
|
95
133
|
FunctionExecutorServerConfiguration(
|
96
134
|
executor_id=self._executor_id,
|
97
|
-
function_executor_id=self.
|
135
|
+
function_executor_id=self._function_executor_state.id,
|
136
|
+
namespace=self._task_input.task.namespace,
|
98
137
|
image_uri=self._task_input.task.image_uri,
|
138
|
+
secret_names=self._task_input.task.secret_names or [],
|
99
139
|
)
|
100
140
|
)
|
101
141
|
initialize_request: InitializeRequest = InitializeRequest(
|
@@ -107,17 +147,29 @@ class SingleTaskRunner:
|
|
107
147
|
)
|
108
148
|
|
109
149
|
try:
|
110
|
-
await function_executor.initialize(
|
150
|
+
await self._function_executor_state.function_executor.initialize(
|
111
151
|
config=config,
|
112
152
|
initialize_request=initialize_request,
|
113
153
|
base_url=self._base_url,
|
114
154
|
config_path=self._config_path,
|
115
155
|
)
|
116
|
-
|
156
|
+
except CustomerError:
|
157
|
+
# We have to follow the valid state transition sequence.
|
158
|
+
await self._function_executor_state.set_status(
|
159
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
|
160
|
+
)
|
161
|
+
await self._function_executor_state.destroy_function_executor()
|
162
|
+
raise
|
117
163
|
except Exception:
|
118
|
-
|
164
|
+
# We have to follow the valid state transition sequence.
|
165
|
+
await self._function_executor_state.set_status(
|
166
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
|
167
|
+
)
|
168
|
+
await self._function_executor_state.destroy_function_executor()
|
119
169
|
raise
|
120
170
|
|
171
|
+
await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
|
172
|
+
|
121
173
|
async def _run(self) -> TaskOutput:
|
122
174
|
request: RunTaskRequest = RunTaskRequest(
|
123
175
|
namespace=self._task_input.task.namespace,
|
@@ -130,13 +182,15 @@ class SingleTaskRunner:
|
|
130
182
|
)
|
131
183
|
if self._task_input.init_value is not None:
|
132
184
|
request.function_init_value.CopyFrom(self._task_input.init_value)
|
133
|
-
channel: grpc.aio.Channel =
|
185
|
+
channel: grpc.aio.Channel = (
|
186
|
+
self._function_executor_state.function_executor.channel()
|
187
|
+
)
|
134
188
|
|
135
189
|
async with _RunningTaskContextManager(
|
136
190
|
invocation_id=self._task_input.task.invocation_id,
|
137
191
|
task_id=self._task_input.task.id,
|
138
192
|
health_check_failed_callback=self._health_check_failed_callback,
|
139
|
-
function_executor_state=self.
|
193
|
+
function_executor_state=self._function_executor_state,
|
140
194
|
):
|
141
195
|
with (
|
142
196
|
metric_function_executor_run_task_rpc_errors.count_exceptions(),
|
@@ -154,31 +208,40 @@ class SingleTaskRunner:
|
|
154
208
|
async def _health_check_failed_callback(self, result: HealthCheckResult):
|
155
209
|
# Function Executor destroy due to the periodic health check failure ensures that
|
156
210
|
# a running task RPC stuck in unhealthy Function Executor fails immidiately.
|
157
|
-
async with self.
|
158
|
-
if
|
159
|
-
|
160
|
-
|
161
|
-
|
211
|
+
async with self._function_executor_state.lock:
|
212
|
+
if (
|
213
|
+
self._function_executor_state.status
|
214
|
+
!= FunctionExecutorStatus.RUNNING_TASK
|
215
|
+
):
|
216
|
+
# Protection in case the callback gets delivered after we finished running the task.
|
217
|
+
return
|
218
|
+
|
219
|
+
await self._function_executor_state.set_status(
|
220
|
+
FunctionExecutorStatus.UNHEALTHY
|
221
|
+
)
|
222
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
162
223
|
|
163
224
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
164
|
-
self.
|
165
|
-
if self.
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
225
|
+
self._function_executor_state.check_locked()
|
226
|
+
if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
|
227
|
+
result: HealthCheckResult = (
|
228
|
+
await self._function_executor_state.function_executor.health_checker().check()
|
229
|
+
)
|
230
|
+
if not result.is_healthy:
|
231
|
+
await self._function_executor_state.set_status(
|
232
|
+
FunctionExecutorStatus.UNHEALTHY
|
233
|
+
)
|
234
|
+
|
235
|
+
if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
|
236
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
173
237
|
|
174
238
|
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
|
175
|
-
self.
|
239
|
+
self._function_executor_state.check_locked()
|
176
240
|
self._logger.error(
|
177
241
|
"Function Executor health check failed, destroying Function Executor",
|
178
242
|
health_check_fail_reason=reason,
|
179
243
|
)
|
180
|
-
self.
|
181
|
-
await self._state.destroy_function_executor()
|
244
|
+
await self._function_executor_state.destroy_function_executor()
|
182
245
|
|
183
246
|
|
184
247
|
class _RunningTaskContextManager:
|
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
|
|
199
262
|
self._state: FunctionExecutorState = function_executor_state
|
200
263
|
|
201
264
|
async def __aenter__(self):
|
202
|
-
self._state.
|
265
|
+
await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
|
203
266
|
self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
204
267
|
task_id=self._task_id,
|
205
268
|
invocation_id=self._invocation_id,
|
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:
|
|
213
276
|
|
214
277
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
215
278
|
await self._state.lock.acquire()
|
216
|
-
|
217
|
-
|
218
|
-
|
279
|
+
# Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
|
280
|
+
if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
|
281
|
+
await self._state.set_status(FunctionExecutorStatus.IDLE)
|
219
282
|
self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
|
220
283
|
task_id=self._task_id
|
221
284
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
|
6
|
+
from .metrics.channel_creator import (
|
7
|
+
metric_grpc_server_channel_creation_latency,
|
8
|
+
metric_grpc_server_channel_creation_retries,
|
9
|
+
metric_grpc_server_channel_creations,
|
10
|
+
)
|
11
|
+
|
12
|
+
_RETRY_INTERVAL_SEC = 5
|
13
|
+
_CONNECT_TIMEOUT_SEC = 5
|
14
|
+
|
15
|
+
|
16
|
+
class ChannelCreator:
    """Creates ready-to-use channels to the gRPC Server, retrying until success or shutdown."""

    def __init__(self, server_address: str, logger: Any):
        self._server_address = server_address
        self._is_shutdown = False
        self._logger = logger.bind(module=__name__)

    async def create(self) -> grpc.aio.Channel:
        """Creates a channel to the gRPC server.

        Blocks until the channel is ready.
        Never raises any exceptions.
        """
        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while not self._is_shutdown:
                try:
                    candidate = grpc.aio.insecure_channel(self._server_address)
                    await asyncio.wait_for(
                        candidate.channel_ready(), timeout=_CONNECT_TIMEOUT_SEC
                    )
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    # Best-effort cleanup of the half-open channel before retrying.
                    try:
                        await candidate.close()
                    except Exception as e:
                        self._logger.error(
                            "failed closing not established channel", exc_info=e
                        )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)
                else:
                    # Channel became ready within the timeout.
                    return candidate

    async def shutdown(self):
        # Stops the retry loop of any in-flight and future create() calls.
        self._is_shutdown = True
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Counts every attempt to create a channel to the gRPC Server.
metric_grpc_server_channel_creations: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creations",
    "Number of times a channel to gRPC Server was created",
)

# Counts retry rounds performed while establishing a channel.
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creation_retries",
    "Number of retries during a channel creation to gRPC Server",
)

# Latency histogram covering the full channel-establishment loop.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "grpc_server_channel_creation",
    "gRPC server channel creation",
)
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Counts every Executor state report RPC sent to the Server.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)

# Counts state report RPCs that failed.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)

# Latency histogram of a single state report RPC round trip.
metric_state_report_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "state_report_rpc", "Executor state report rpc to Server"
)
|
@@ -7,29 +7,29 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
SerializedObject,
|
8
8
|
)
|
9
9
|
|
10
|
-
from indexify.
|
10
|
+
from indexify.proto.task_scheduler_pb2 import (
|
11
11
|
DesiredExecutorState,
|
12
12
|
FunctionExecutorDescription,
|
13
13
|
FunctionExecutorStatus,
|
14
14
|
GetDesiredExecutorStatesRequest,
|
15
15
|
)
|
16
|
-
from indexify.
|
16
|
+
from indexify.proto.task_scheduler_pb2_grpc import (
|
17
17
|
TaskSchedulerServiceStub,
|
18
18
|
)
|
19
19
|
|
20
|
-
from
|
21
|
-
from
|
22
|
-
from
|
23
|
-
from
|
20
|
+
from ..downloader import Downloader
|
21
|
+
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
22
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
23
|
+
from ..function_executor.function_executor_states_container import (
|
24
24
|
FunctionExecutorStatesContainer,
|
25
25
|
)
|
26
|
-
from
|
26
|
+
from ..function_executor.server.function_executor_server_factory import (
|
27
27
|
FunctionExecutorServerConfiguration,
|
28
28
|
FunctionExecutorServerFactory,
|
29
29
|
)
|
30
|
-
from
|
31
|
-
from
|
32
|
-
from
|
30
|
+
from ..function_executor.task_input import TaskInput
|
31
|
+
from ..function_executor.task_output import TaskOutput
|
32
|
+
from ..metrics.executor import (
|
33
33
|
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
34
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
35
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
@@ -42,7 +42,10 @@ from .metrics.executor import (
|
|
42
42
|
metric_tasks_fetched,
|
43
43
|
metric_tasks_reporting_outcome,
|
44
44
|
)
|
45
|
-
from
|
45
|
+
from ..task_reporter import TaskReporter
|
46
|
+
from .channel_creator import ChannelCreator
|
47
|
+
|
48
|
+
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
46
49
|
|
47
50
|
|
48
51
|
class ExecutorStateReconciler:
|
@@ -55,11 +58,13 @@ class ExecutorStateReconciler:
|
|
55
58
|
config_path: Optional[str],
|
56
59
|
downloader: Downloader,
|
57
60
|
task_reporter: TaskReporter,
|
58
|
-
|
61
|
+
channel_creator: ChannelCreator,
|
59
62
|
logger: Any,
|
60
63
|
):
|
61
64
|
self._executor_id: str = executor_id
|
62
|
-
self.
|
65
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
66
|
+
function_executor_server_factory
|
67
|
+
)
|
63
68
|
self._base_url: str = base_url
|
64
69
|
self._config_path: Optional[str] = config_path
|
65
70
|
self._downloader: Downloader = downloader
|
@@ -67,39 +72,60 @@ class ExecutorStateReconciler:
|
|
67
72
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
68
73
|
function_executor_states
|
69
74
|
)
|
70
|
-
self.
|
75
|
+
self._channel_creator = channel_creator
|
71
76
|
self._logger: Any = logger.bind(module=__name__)
|
72
77
|
self._is_shutdown: bool = False
|
73
|
-
self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
|
74
78
|
self._server_last_clock: Optional[int] = None
|
75
79
|
|
76
80
|
async def run(self):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
81
|
+
"""Runs the state reconciler.
|
82
|
+
|
83
|
+
Never raises any exceptions.
|
84
|
+
"""
|
85
|
+
while not self._is_shutdown:
|
86
|
+
async with await self._channel_creator.create() as server_channel:
|
87
|
+
server_channel: grpc.aio.Channel
|
88
|
+
stub = TaskSchedulerServiceStub(server_channel)
|
89
|
+
while not self._is_shutdown:
|
90
|
+
try:
|
91
|
+
# TODO: Report state once before starting the stream.
|
92
|
+
desired_states_stream: AsyncGenerator[
|
93
|
+
DesiredExecutorState, None
|
94
|
+
] = stub.get_desired_executor_states(
|
95
|
+
GetDesiredExecutorStatesRequest(
|
96
|
+
executor_id=self._executor_id
|
97
|
+
)
|
98
|
+
)
|
99
|
+
await self._process_desired_states_stream(desired_states_stream)
|
100
|
+
except Exception as e:
|
101
|
+
self._logger.error(
|
102
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
103
|
+
exc_info=e,
|
104
|
+
)
|
105
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
106
|
+
break
|
107
|
+
|
108
|
+
self._logger.info("State reconciler shutdown.")
|
109
|
+
|
110
|
+
async def _process_desired_states_stream(
|
111
|
+
self, desired_states: AsyncGenerator[DesiredExecutorState, None]
|
112
|
+
):
|
82
113
|
async for new_state in desired_states:
|
83
114
|
if self._is_shutdown:
|
84
115
|
return
|
116
|
+
|
85
117
|
new_state: DesiredExecutorState
|
86
118
|
if self._server_last_clock is not None:
|
87
119
|
if self._server_last_clock >= new_state.clock:
|
88
120
|
continue # Duplicate or outdated message state sent by Server.
|
89
121
|
|
90
122
|
self._server_last_clock = new_state.clock
|
91
|
-
|
123
|
+
await self._reconcile_state(new_state)
|
92
124
|
|
93
125
|
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
# Simple non concurrent implementation for now for the PoC.
|
98
|
-
# Obtain this lock to force only a single coroutine doing the reconciliation.
|
99
|
-
async with self._reconciliation_lock:
|
100
|
-
await self._reconcile_function_executors(new_state)
|
101
|
-
# TODO
|
102
|
-
# await self._reconcile_task_allocations(new_state)
|
126
|
+
await self._reconcile_function_executors(new_state)
|
127
|
+
# TODO
|
128
|
+
# await self._reconcile_task_allocations(new_state)
|
103
129
|
|
104
130
|
async def shutdown(self):
|
105
131
|
"""Shuts down the state reconciler.
|
@@ -121,6 +147,7 @@ class ExecutorStateReconciler:
|
|
121
147
|
graph_name=desired_function_executor.graph_name,
|
122
148
|
graph_version=desired_function_executor.graph_version,
|
123
149
|
function_name=desired_function_executor.function_name,
|
150
|
+
image_uri=desired_function_executor.image_uri,
|
124
151
|
)
|
125
152
|
)
|
126
153
|
|
@@ -203,13 +230,15 @@ class ExecutorStateReconciler:
|
|
203
230
|
logger=logger,
|
204
231
|
)
|
205
232
|
function_executor: FunctionExecutor = FunctionExecutor(
|
206
|
-
server_factory=self.
|
233
|
+
server_factory=self._function_executor_server_factory, logger=logger
|
207
234
|
)
|
208
235
|
config: FunctionExecutorServerConfiguration = (
|
209
236
|
FunctionExecutorServerConfiguration(
|
210
237
|
executor_id=self._executor_id,
|
211
238
|
function_executor_id=description.id,
|
239
|
+
namespace=description.namespace,
|
212
240
|
image_uri=description.image_uri,
|
241
|
+
secret_names=list(description.secret_names),
|
213
242
|
)
|
214
243
|
)
|
215
244
|
initialize_request: InitializeRequest = InitializeRequest(
|