indexify 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +21 -18
- indexify/executor/api_objects.py +12 -0
- indexify/executor/downloader.py +4 -1
- indexify/executor/executor.py +65 -28
- indexify/executor/executor_flavor.py +7 -0
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +3 -1
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +20 -2
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +35 -2
- indexify/executor/grpc/channel_manager.py +160 -0
- indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify/executor/grpc/function_executor_controller.py +421 -0
- indexify/executor/grpc/state_reconciler.py +33 -38
- indexify/executor/grpc/state_reporter.py +100 -39
- indexify/executor/grpc/task_controller.py +449 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_fetcher.py +8 -3
- indexify/executor/task_reporter.py +112 -4
- indexify/executor/task_runner.py +1 -0
- indexify/proto/{task_scheduler.proto → executor_api.proto} +86 -11
- indexify/proto/executor_api_pb2.py +80 -0
- indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +162 -7
- indexify/proto/executor_api_pb2_grpc.py +227 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/RECORD +32 -28
- indexify/executor/grpc/channel_creator.py +0 -53
- indexify/proto/task_scheduler_pb2.py +0 -64
- indexify/proto/task_scheduler_pb2_grpc.py +0 -170
- /indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
- {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,160 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, Dict, Optional
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
import yaml
|
6
|
+
|
7
|
+
from .metrics.channel_manager import (
|
8
|
+
metric_grpc_server_channel_creation_latency,
|
9
|
+
metric_grpc_server_channel_creation_retries,
|
10
|
+
metric_grpc_server_channel_creations,
|
11
|
+
)
|
12
|
+
|
13
|
+
_RETRY_INTERVAL_SEC = 5
|
14
|
+
_CONNECT_TIMEOUT_SEC = 5
|
15
|
+
|
16
|
+
|
17
|
+
class ChannelManager:
    """Manages a single shared gRPC channel to the Server.

    The channel is created lazily on first use, recreated when it's detected
    to be unhealthy, and optionally secured with TLS loaded from the Executor
    config file. All public methods are safe to call concurrently from
    multiple asyncio tasks.
    """

    def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
        self._logger: Any = logger.bind(module=__name__, server_address=server_address)
        self._server_address: str = server_address
        self._channel_credentials: Optional[grpc.ChannelCredentials] = None
        # This lock protects the fields below.
        self._lock = asyncio.Lock()
        self._channel: Optional[grpc.aio.Channel] = None

        self._init_tls(config_path)

    def _init_tls(self, config_path: Optional[str]):
        """Loads TLS channel credentials from the config file if TLS is enabled there.

        No-op when no config file is supplied or "use_tls" is falsy in it.
        """
        if config_path is None:
            return

        # The same config file format as in Tensorlake SDK HTTP client, see:
        # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
        with open(config_path, "r") as config_file:
            config = yaml.safe_load(config_file)

        if not config.get("use_tls", False):
            return

        tls_config: Dict[str, str] = config["tls_config"]
        cert_path: Optional[str] = tls_config.get("cert_path", None)
        key_path: Optional[str] = tls_config.get("key_path", None)
        ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)

        self._logger = self._logger.bind(
            cert_path=cert_path,
            key_path=key_path,
            ca_bundle_path=ca_bundle_path,
        )
        self._logger.info("TLS is enabled for grpc channels to server")

        private_key: Optional[bytes] = None
        certificate_chain: Optional[bytes] = None
        root_certificates: Optional[bytes] = None

        if cert_path is not None:
            with open(cert_path, "rb") as cert_file:
                certificate_chain = cert_file.read()
        if key_path is not None:
            with open(key_path, "rb") as key_file:
                private_key = key_file.read()
        if ca_bundle_path is not None:
            with open(ca_bundle_path, "rb") as ca_bundle_file:
                root_certificates = ca_bundle_file.read()

        self._channel_credentials = grpc.ssl_channel_credentials(
            root_certificates=root_certificates,
            private_key=private_key,
            certificate_chain=certificate_chain,
        )

    async def get_channel(self) -> grpc.aio.Channel:
        """Returns a channel to the gRPC server.

        Returns a ready to use channel. Blocks until the channel is ready,
        never raises any exceptions.
        If previously returned channel is healthy then returns it again.
        Otherwise, returns a new channel but closes the previously returned one.
        """
        # Use the lock to ensure that we only create one channel without race conditions.
        async with self._lock:
            if self._channel is None:
                self._channel = await self._create_channel()
            elif not await self._locked_channel_is_healthy():
                self._logger.info("grpc channel to server is unhealthy")
                await self._destroy_locked_channel()
                self._channel = await self._create_channel()

            return self._channel

    async def _create_channel(self) -> grpc.aio.Channel:
        """Creates a new channel to the gRPC server.

        Returns a ready to use channel. Blocks until the channel
        is ready, never raises any exceptions. Retries forever with
        a fixed interval between attempts.
        """
        self._logger.info("creating new grpc server channel")

        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while True:
                # Initialized before the try block so the cleanup code below can
                # tell whether channel construction itself failed (channel stays None)
                # or only the readiness wait timed out.
                channel: Optional[grpc.aio.Channel] = None
                try:
                    if self._channel_credentials is None:
                        channel = grpc.aio.insecure_channel(target=self._server_address)
                    else:
                        channel = grpc.aio.secure_channel(
                            target=self._server_address,
                            credentials=self._channel_credentials,
                        )

                    await asyncio.wait_for(
                        channel.channel_ready(),
                        timeout=_CONNECT_TIMEOUT_SEC,
                    )
                    return channel
                except Exception as e:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec",
                        exc_info=e,
                    )
                    if channel is not None:
                        try:
                            await channel.close()
                        except Exception as close_error:
                            self._logger.error(
                                "failed closing not established channel",
                                exc_info=close_error,
                            )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)

    async def _locked_channel_is_healthy(self) -> bool:
        """Checks if the channel is healthy.

        Returns True if the channel is healthy, False otherwise.
        self._lock must be acquired before calling this method.
        Never raises any exceptions.
        """
        try:
            return self._channel.get_state() == grpc.ChannelConnectivity.READY
        except Exception as e:
            # Assume that the channel is healthy because get_state() method is marked as experimental
            # so we can't fully trust it.
            self._logger.error(
                "failed getting channel state, assuming channel is healthy", exc_info=e
            )
            return True

    async def _destroy_locked_channel(self):
        """Closes the existing channel.

        self._lock must be acquired before calling this method.
        Never raises any exceptions.
        """
        try:
            await self._channel.close()
        except Exception as e:
            self._logger.error("failed closing channel", exc_info=e)
        self._channel = None

    async def shutdown(self):
        """Closes the channel if one was created.

        Never raises any exceptions. A later get_channel() call would
        create a fresh channel.
        """
        async with self._lock:
            if self._channel is not None:
                await self._destroy_locked_channel()
@@ -0,0 +1,26 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import List, Set
|
3
|
+
|
4
|
+
|
5
|
+
class CompletedTasksContainer:
    """An asyncio concurrent container for the completed task IDs."""

    def __init__(self):
        # Guards _ids; every accessor takes the lock before touching the set.
        self._lock: asyncio.Lock = asyncio.Lock()
        self._ids: Set[str] = set()

    async def add(self, task_id: str) -> None:
        """Records the task ID as completed."""
        async with self._lock:
            self._ids.add(task_id)

    async def contains(self, task_id: str) -> bool:
        """Returns True if the task ID was recorded as completed."""
        async with self._lock:
            return task_id in self._ids

    async def replace(self, task_ids: List[str]) -> None:
        """Drops all recorded IDs and records the supplied ones instead."""
        async with self._lock:
            self._ids = set(task_ids)
@@ -0,0 +1,421 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, Optional
|
3
|
+
|
4
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
5
|
+
InitializeRequest,
|
6
|
+
SerializedObject,
|
7
|
+
)
|
8
|
+
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
9
|
+
|
10
|
+
from indexify.proto.executor_api_pb2 import (
|
11
|
+
FunctionExecutorDescription,
|
12
|
+
)
|
13
|
+
from indexify.proto.executor_api_pb2 import (
|
14
|
+
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
15
|
+
)
|
16
|
+
|
17
|
+
from ..downloader import Downloader
|
18
|
+
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
19
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
20
|
+
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
21
|
+
from ..function_executor.health_checker import HealthCheckResult
|
22
|
+
from ..function_executor.server.function_executor_server_factory import (
|
23
|
+
FunctionExecutorServerConfiguration,
|
24
|
+
FunctionExecutorServerFactory,
|
25
|
+
)
|
26
|
+
|
27
|
+
|
28
|
+
class FunctionExecutorController:
    """Drives a single Function Executor (FE) towards the status desired by the Server.

    The Server pushes a desired status via set_desired_status(); a background
    reconciliation loop task compares it with the current status stored in the
    shared FunctionExecutorState and performs the transitions (create/destroy/
    shutdown). Reconciliation runs with concurrency 1.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_state: FunctionExecutorState,
        function_executor_description: FunctionExecutorDescription,
        function_executor_server_factory: FunctionExecutorServerFactory,
        downloader: Downloader,
        base_url: str,
        config_path: str,
        logger: Any,
    ):
        """Initializes the FunctionExecutorController.

        Raises ValueError if the supplied FunctionExecutorDescription is not valid.
        """
        _validate_function_executor_description(function_executor_description)
        self._executor_id: str = executor_id
        self._function_executor_state: FunctionExecutorState = function_executor_state
        self._function_executor_description: FunctionExecutorDescription = (
            function_executor_description
        )
        self._function_executor_server_factory: FunctionExecutorServerFactory = (
            function_executor_server_factory
        )
        self._downloader: Downloader = downloader
        self._base_url: str = base_url
        self._config_path: str = config_path
        self._logger: Any = logger.bind(
            module=__name__,
            function_executor_id=function_executor_description.id,
            namespace=function_executor_description.namespace,
            graph_name=function_executor_description.graph_name,
            graph_version=function_executor_description.graph_version,
            function_name=function_executor_description.function_name,
            image_uri=function_executor_description.image_uri,
        )
        # NOTE: the task is created before self._lock below exists; this is fine
        # because the coroutine only starts running once control returns to the
        # event loop, i.e. after __init__ finished.
        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
            self._reconciliation_loop()
        )
        # This lock protects the desired status.
        self._lock: asyncio.Lock = asyncio.Lock()
        # The same as the initial FE status.
        self._desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
            lock=self._lock
        )

    async def set_desired_status(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Updates the desired Function Executor status.

        Reconciliation is done asynchronously.
        """
        async with self._lock:
            if self._desired_status == desired_status:
                return
            self._desired_status = desired_status
            self._desired_status_change_notifier.notify_all()

    async def _reconciliation_loop(self) -> None:
        """Waits for desired status changes and reconciles them one at a time.

        Runs forever until the task is cancelled by _shutdown().
        """
        self._logger.info("function executor controller reconciliation loop started")
        # The same as the initial FE status.
        last_seen_desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        # The loop is exited via loop async task cancellation on FE shutdown.
        while True:
            async with self._lock:
                # Condition.wait() releases self._lock while waiting, so
                # set_desired_status() can make progress concurrently.
                while last_seen_desired_status == self._desired_status:
                    await self._desired_status_change_notifier.wait()

                last_seen_desired_status = self._desired_status
            # It's guaranteed that we don't run _reconcile concurrently multiple times.
            await self._reconcile(last_seen_desired_status)

    async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
        """Performs the transition from the current FE status to the desired one.

        Takes the FE state lock for the duration of the reconciliation.
        """
        async with self._function_executor_state.lock:
            current_status: FunctionExecutorStatus = (
                self._function_executor_state.status
            )
            # We have to process all possible combination of current and desired statuses.
            if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.IDLE:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has IDLE status.
                # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
                # Transition from IDLE to UNHEALTHY can only be done by FE controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.RUNNING_TASK:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
                # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.UNHEALTHY:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has UNHEALTHY status.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.DESTROYED:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
                ):
                    return  # Same status, nothing to do.

                return await self._reconcile_from_destroyed(desired_status)

            # _reconcile() can't be called when current FE status is one of "long running" states
            # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
            # is called with concurrency of 1 and _reconcile() waits until these long running states
            # (operations) are finished before returning.
            #
            # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
            # status, it cancels the reconciliation loop task.
            self._logger.error(
                "unexpected current function executor status, skipping state reconciliation",
                current_status=current_status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )

    async def _destroy_or_shutdown_fe_if_desired(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Destroys the Function Executor if desired status asks for it.

        Otherwise logs an error because other actions are not allowed by the current status.
        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        await self._destroy_function_executor()
        # FE state status is now DESTROYED.
        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.

    async def _reconcile_from_destroyed(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Reconciles the FE state when it has DESTROYED status.

        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.
            return

        # All the rest of the allowed desired statuses ask to create the FE.
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.STARTING_UP
        )

        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
        next_status_message: str = ""
        # FE creation is slow; release the state lock while it runs so other
        # tasks can observe the STARTING_UP status.
        async with _UnlockedLockContextManager(self._function_executor_state.lock):
            try:
                function_executor: FunctionExecutor = await _create_function_executor(
                    function_executor_description=self._function_executor_description,
                    function_executor_server_factory=self._function_executor_server_factory,
                    downloader=self._downloader,
                    executor_id=self._executor_id,
                    base_url=self._base_url,
                    config_path=self._config_path,
                    logger=self._logger,
                )
            except CustomerError as e:
                next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
                next_status_message = str(e)
            except Exception as e:
                # NOTE(review): the platform error is swallowed here without being
                # logged — consider logging `e` before mapping it to the status.
                next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR

        # FE state lock is acquired again at this point.
        await self._function_executor_state.set_status(next_status, next_status_message)

        if next_status == FunctionExecutorStatus.IDLE:
            # Task controllers will notice that this FE is IDLE and start running on it one by one.
            self._function_executor_state.function_executor = function_executor
            # Health checker starts after FE creation and gets automatically stopped on FE destroy.
            self._function_executor_state.function_executor.health_checker().start(
                self._health_check_failed_callback
            )

    async def _destroy_function_executor(self) -> None:
        """Destroys the Function Executor if it exists.

        Caller holds the FE state lock.
        """
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.DESTROYING
        )
        # Destroy can be slow; release the state lock for its duration.
        # NOTE(review): destroy() is called unconditionally — if
        # function_executor can be None here (e.g. after a STARTUP_FAILED_*
        # status), this raises AttributeError; confirm it's always set.
        async with _UnlockedLockContextManager(self._function_executor_state.lock):
            await self._function_executor_state.function_executor.destroy()
        await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
        self._function_executor_state.function_executor = None

    async def _shutdown(self) -> None:
        """Shuts down the controller.

        Caller holds the FE state lock.
        Raises asyncio.CancelledError on return when called from reconciliation loop.
        """
        self._logger.info("shutting down function executor controller")
        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
        self._reconciliation_loop_task.cancel()
        await self._reconciliation_loop_task

    async def _health_check_failed_callback(self, result: HealthCheckResult):
        """Marks the FE UNHEALTHY when a health check fails while it's in use.

        Invoked by the health checker started in _reconcile_from_destroyed().
        """
        async with self._function_executor_state.lock:
            if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
                return

            if self._function_executor_state.status in (
                FunctionExecutorStatus.IDLE,
                FunctionExecutorStatus.RUNNING_TASK,
            ):
                # There can be false positive health check failures when we're creating
                # or destroying FEs so we're not interested in them.
                #
                # Server should react to this transition into unhealthy state and ask to
                # destroy this FE.
                await self._function_executor_state.set_status(
                    FunctionExecutorStatus.UNHEALTHY
                )
|
324
|
+
|
325
|
+
async def _create_function_executor(
    function_executor_description: FunctionExecutorDescription,
    function_executor_server_factory: FunctionExecutorServerFactory,
    downloader: Downloader,
    executor_id: str,
    base_url: str,
    config_path: str,
    logger: Any,
) -> FunctionExecutor:
    """Creates a function executor.

    Raises Exception in case of failure.
    Raises CustomerError if customer code failed during FE creation.
    """
    description = function_executor_description

    # Fetch the serialized graph the FE will be initialized with.
    graph_blob: SerializedObject = await downloader.download_graph(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        logger=logger,
    )

    server_config: FunctionExecutorServerConfiguration = (
        FunctionExecutorServerConfiguration(
            executor_id=executor_id,
            function_executor_id=description.id,
            namespace=description.namespace,
            secret_names=list(description.secret_names),
        )
    )
    if description.HasField("image_uri"):
        server_config.image_uri = description.image_uri

    init_request: InitializeRequest = InitializeRequest(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        function_name=description.function_name,
        graph=graph_blob,
    )

    # TODO: Add integration tests with FE customer code initialization timeout
    # when end-to-end implementation is done.
    customer_code_timeout_sec: Optional[float] = (
        description.customer_code_timeout_ms / 1000.0
        if description.HasField("customer_code_timeout_ms")
        else None
    )

    executor: FunctionExecutor = FunctionExecutor(
        server_factory=function_executor_server_factory, logger=logger
    )

    try:
        # Raises CustomerError if initialization failed in customer code or customer code timed out.
        await executor.initialize(
            config=server_config,
            initialize_request=init_request,
            base_url=base_url,
            config_path=config_path,
            customer_code_timeout_sec=customer_code_timeout_sec,
        )
    except Exception:
        # Don't leak the FE server/process on a failed initialization.
        await executor.destroy()
        raise
    return executor
387
|
+
|
388
|
+
|
389
|
+
def _validate_function_executor_description(
    function_executor_description: FunctionExecutorDescription,
) -> None:
    """Validates the supplied FE description.

    Raises ValueError if the description is not valid.
    """
    validator = MessageValidator(function_executor_description)
    for required in ("id", "namespace", "graph_name", "graph_version", "function_name"):
        validator.required_field(required)
    # image_uri is optional; secret_names can be empty; resource_limits is optional.
405
|
+
|
406
|
+
|
407
|
+
class _UnlockedLockContextManager:
|
408
|
+
"""Unlocks its lock on enter to the scope and locks it back on exit."""
|
409
|
+
|
410
|
+
def __init__(
|
411
|
+
self,
|
412
|
+
lock: asyncio.Lock,
|
413
|
+
):
|
414
|
+
self._lock: asyncio.Lock = lock
|
415
|
+
|
416
|
+
async def __aenter__(self):
|
417
|
+
self._lock.release()
|
418
|
+
return self
|
419
|
+
|
420
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
421
|
+
await self._lock.acquire()
|