indexify 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. indexify/cli/cli.py +21 -18
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/downloader.py +4 -1
  4. indexify/executor/executor.py +65 -28
  5. indexify/executor/executor_flavor.py +7 -0
  6. indexify/executor/function_executor/function_executor.py +24 -11
  7. indexify/executor/function_executor/function_executor_state.py +9 -1
  8. indexify/executor/function_executor/function_executor_states_container.py +3 -1
  9. indexify/executor/function_executor/function_executor_status.py +2 -0
  10. indexify/executor/function_executor/health_checker.py +20 -2
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  12. indexify/executor/function_executor/single_task_runner.py +15 -11
  13. indexify/executor/function_executor/task_output.py +35 -2
  14. indexify/executor/grpc/channel_manager.py +160 -0
  15. indexify/executor/grpc/completed_tasks_container.py +26 -0
  16. indexify/executor/grpc/function_executor_controller.py +421 -0
  17. indexify/executor/grpc/state_reconciler.py +33 -38
  18. indexify/executor/grpc/state_reporter.py +100 -39
  19. indexify/executor/grpc/task_controller.py +449 -0
  20. indexify/executor/metrics/task_reporter.py +14 -0
  21. indexify/executor/task_fetcher.py +8 -3
  22. indexify/executor/task_reporter.py +112 -4
  23. indexify/executor/task_runner.py +1 -0
  24. indexify/proto/{task_scheduler.proto → executor_api.proto} +86 -11
  25. indexify/proto/executor_api_pb2.py +80 -0
  26. indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +162 -7
  27. indexify/proto/executor_api_pb2_grpc.py +227 -0
  28. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
  29. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/RECORD +32 -28
  30. indexify/executor/grpc/channel_creator.py +0 -53
  31. indexify/proto/task_scheduler_pb2.py +0 -64
  32. indexify/proto/task_scheduler_pb2_grpc.py +0 -170
  33. /indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
  34. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,160 @@
1
+ import asyncio
2
+ from typing import Any, Dict, Optional
3
+
4
+ import grpc.aio
5
+ import yaml
6
+
7
+ from .metrics.channel_manager import (
8
+ metric_grpc_server_channel_creation_latency,
9
+ metric_grpc_server_channel_creation_retries,
10
+ metric_grpc_server_channel_creations,
11
+ )
12
+
13
_RETRY_INTERVAL_SEC = 5
_CONNECT_TIMEOUT_SEC = 5


class ChannelManager:
    """Manages a single shared gRPC channel to the server.

    The channel is created lazily, recreated when it is observed to be
    unhealthy, and optionally secured with TLS loaded from the config file.
    """

    def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
        self._logger: Any = logger.bind(module=__name__, server_address=server_address)
        self._server_address: str = server_address
        self._channel_credentials: Optional[grpc.ChannelCredentials] = None
        # This lock protects the fields below.
        self._lock = asyncio.Lock()
        self._channel: Optional[grpc.aio.Channel] = None

        self._init_tls(config_path)

    def _init_tls(self, config_path: Optional[str]):
        """Loads TLS credentials from the config file, if TLS is enabled there.

        No-op when no config file is supplied or `use_tls` is false.
        """
        if config_path is None:
            return

        # The same config file format as in Tensorlake SDK HTTP client, see:
        # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
        with open(config_path, "r") as config_file:
            config = yaml.safe_load(config_file)

        if not config.get("use_tls", False):
            return

        tls_config: Dict[str, str] = config["tls_config"]
        cert_path: Optional[str] = tls_config.get("cert_path", None)
        key_path: Optional[str] = tls_config.get("key_path", None)
        ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)

        self._logger = self._logger.bind(
            cert_path=cert_path,
            key_path=key_path,
            ca_bundle_path=ca_bundle_path,
        )
        self._logger.info("TLS is enabled for grpc channels to server")

        private_key: Optional[bytes] = None
        certificate_chain: Optional[bytes] = None
        root_certificates: Optional[bytes] = None

        if cert_path is not None:
            with open(cert_path, "rb") as cert_file:
                certificate_chain = cert_file.read()
        if key_path is not None:
            with open(key_path, "rb") as key_file:
                private_key = key_file.read()
        if ca_bundle_path is not None:
            with open(ca_bundle_path, "rb") as ca_bundle_file:
                root_certificates = ca_bundle_file.read()

        self._channel_credentials = grpc.ssl_channel_credentials(
            root_certificates=root_certificates,
            private_key=private_key,
            certificate_chain=certificate_chain,
        )

    async def get_channel(self) -> grpc.aio.Channel:
        """Returns a channel to the gRPC server.

        Returns a ready to use channel. Blocks until the channel is ready,
        never raises any exceptions.
        If previously returned channel is healthy then returns it again.
        Otherwise, returns a new channel but closes the previously returned one.
        """
        # Use the lock to ensure that we only create one channel without race conditions.
        async with self._lock:
            if self._channel is None:
                self._channel = await self._create_channel()
            elif not await self._locked_channel_is_healthy():
                self._logger.info("grpc channel to server is unhealthy")
                await self._destroy_locked_channel()
                self._channel = await self._create_channel()

            return self._channel

    async def _create_channel(self) -> grpc.aio.Channel:
        """Creates a new channel to the gRPC server.

        Returns a ready to use channel. Blocks until the channel
        is ready, never raises any exceptions.
        """
        self._logger.info("creating new grpc server channel")

        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while True:
                # Reset per attempt so the except branch never sees a channel
                # from a previous iteration (or an unbound name if channel
                # construction itself raised).
                channel: Optional[grpc.aio.Channel] = None
                try:
                    if self._channel_credentials is None:
                        channel = grpc.aio.insecure_channel(target=self._server_address)
                    else:
                        channel = grpc.aio.secure_channel(
                            target=self._server_address,
                            credentials=self._channel_credentials,
                        )

                    await asyncio.wait_for(
                        channel.channel_ready(),
                        timeout=_CONNECT_TIMEOUT_SEC,
                    )
                    return channel
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    if channel is not None:
                        try:
                            await channel.close()
                        except Exception as e:
                            self._logger.error(
                                "failed closing not established channel", exc_info=e
                            )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)

    async def _locked_channel_is_healthy(self) -> bool:
        """Checks if the channel is healthy.

        Returns True if the channel is healthy, False otherwise.
        self._lock must be acquired before calling this method.
        Never raises any exceptions.
        """
        try:
            return self._channel.get_state() == grpc.ChannelConnectivity.READY
        except Exception as e:
            # Assume that the channel is healthy because get_state() method is marked as experimental
            # so we can't fully trust it.
            self._logger.error(
                "failed getting channel state, assuming channel is healthy", exc_info=e
            )
            return True

    async def _destroy_locked_channel(self):
        """Closes the existing channel.

        self._lock must be acquired before calling this method.
        Never raises any exceptions.
        """
        try:
            await self._channel.close()
        except Exception as e:
            self._logger.error("failed closing channel", exc_info=e)
        self._channel = None

    async def shutdown(self):
        """Closes the channel if one exists.

        Never raises any exceptions.
        """
        async with self._lock:
            if self._channel is not None:
                await self._destroy_locked_channel()
@@ -0,0 +1,26 @@
1
+ import asyncio
2
+ from typing import List, Set
3
+
4
+
5
class CompletedTasksContainer:
    """An asyncio concurrent container for the completed task IDs."""

    def __init__(self):
        # Every read and write of the ID set below happens while holding this lock.
        self._lock: asyncio.Lock = asyncio.Lock()
        self._ids: Set[str] = set()

    async def add(self, task_id: str) -> None:
        """Add a task to the container."""
        async with self._lock:
            self._ids.add(task_id)

    async def contains(self, task_id: str) -> bool:
        """Check if the task is in the container."""
        async with self._lock:
            return task_id in self._ids

    async def replace(self, task_ids: List[str]) -> None:
        """Replaces the task IDs with the supplied task IDs."""
        async with self._lock:
            self._ids = set(task_ids)
@@ -0,0 +1,421 @@
1
+ import asyncio
2
+ from typing import Any, Optional
3
+
4
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
5
+ InitializeRequest,
6
+ SerializedObject,
7
+ )
8
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
9
+
10
+ from indexify.proto.executor_api_pb2 import (
11
+ FunctionExecutorDescription,
12
+ )
13
+ from indexify.proto.executor_api_pb2 import (
14
+ FunctionExecutorStatus as FunctionExecutorStatusProto,
15
+ )
16
+
17
+ from ..downloader import Downloader
18
+ from ..function_executor.function_executor import CustomerError, FunctionExecutor
19
+ from ..function_executor.function_executor_state import FunctionExecutorState
20
+ from ..function_executor.function_executor_status import FunctionExecutorStatus
21
+ from ..function_executor.health_checker import HealthCheckResult
22
+ from ..function_executor.server.function_executor_server_factory import (
23
+ FunctionExecutorServerConfiguration,
24
+ FunctionExecutorServerFactory,
25
+ )
26
+
27
+
28
class FunctionExecutorController:
    """Drives a Function Executor towards the desired status set by Server.

    A background reconciliation loop waits for desired-status changes and
    creates/destroys the Function Executor accordingly.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_state: FunctionExecutorState,
        function_executor_description: FunctionExecutorDescription,
        function_executor_server_factory: FunctionExecutorServerFactory,
        downloader: Downloader,
        base_url: str,
        config_path: str,
        logger: Any,
    ):
        """Initializes the FunctionExecutorController.

        Raises ValueError if the supplied FunctionExecutorDescription is not valid.
        """
        _validate_function_executor_description(function_executor_description)
        self._executor_id: str = executor_id
        self._function_executor_state: FunctionExecutorState = function_executor_state
        self._function_executor_description: FunctionExecutorDescription = (
            function_executor_description
        )
        self._function_executor_server_factory: FunctionExecutorServerFactory = (
            function_executor_server_factory
        )
        self._downloader: Downloader = downloader
        self._base_url: str = base_url
        self._config_path: str = config_path
        self._logger: Any = logger.bind(
            module=__name__,
            function_executor_id=function_executor_description.id,
            namespace=function_executor_description.namespace,
            graph_name=function_executor_description.graph_name,
            graph_version=function_executor_description.graph_version,
            function_name=function_executor_description.function_name,
            image_uri=function_executor_description.image_uri,
        )
        # NOTE(review): the loop task is created before the fields below are
        # assigned; this is safe only because create_task doesn't run the
        # coroutine until the next await point, after __init__ returns.
        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
            self._reconciliation_loop()
        )
        # The lock protects the desired status below.
        self._lock: asyncio.Lock = asyncio.Lock()
        # The same as the initial FE status.
        self._desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
            lock=self._lock
        )

    async def set_desired_status(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Updates the desired Function Executor status.

        Reconciliation is done asynchronously.
        """
        async with self._lock:
            if self._desired_status == desired_status:
                return
            self._desired_status = desired_status
            self._desired_status_change_notifier.notify_all()

    async def _reconciliation_loop(self) -> None:
        """Waits for desired status changes and reconciles them one at a time."""
        self._logger.info("function executor controller reconciliation loop started")
        # The same as the initial FE status.
        last_seen_desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        # The loop is exited via loop async task cancellation on FE shutdown.
        while True:
            async with self._lock:
                while last_seen_desired_status == self._desired_status:
                    await self._desired_status_change_notifier.wait()

                last_seen_desired_status = self._desired_status
            # It's guaranteed that we don't run _reconcile concurrently multiple times.
            await self._reconcile(last_seen_desired_status)

    async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
        """Moves the FE from its current status towards the desired status."""
        async with self._function_executor_state.lock:
            current_status: FunctionExecutorStatus = (
                self._function_executor_state.status
            )
            # We have to process all possible combination of current and desired statuses.
            if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.IDLE:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has IDLE status.
                # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
                # Transition from IDLE to UNHEALTHY can only be done by FE controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.RUNNING_TASK:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
                # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.UNHEALTHY:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has UNHEALTHY status.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.DESTROYED:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
                ):
                    return  # Same status, nothing to do.

                return await self._reconcile_from_destroyed(desired_status)

            # _reconcile() can't be called when current FE status is one of "long running" states
            # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
            # is called with concurrency of 1 and _reconcile() waits until these long running states
            # (operations) are finished before returning.
            #
            # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
            # status, it cancels the reconciliation loop task.
            self._logger.error(
                "unexpected current function executor status, skipping state reconciliation",
                current_status=current_status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )

    async def _destroy_or_shutdown_fe_if_desired(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Destroys the Function Executor if desired status asks for it.

        Otherwise logs an error because other actions are not allowed by the current status.
        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        await self._destroy_function_executor()
        # FE state status is now DESTROYED.
        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.

    async def _reconcile_from_destroyed(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Reconciles the FE state when it has DESTROYED status.

        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.
            return

        # All the rest of the allowed desired statuses ask to create the FE.
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.STARTING_UP
        )

        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
        next_status_message: str = ""
        # The slow FE creation runs with the state lock released so other
        # coroutines can observe the STARTING_UP status meanwhile.
        async with _UnlockedLockContextManager(self._function_executor_state.lock):
            try:
                function_executor: FunctionExecutor = await _create_function_executor(
                    function_executor_description=self._function_executor_description,
                    function_executor_server_factory=self._function_executor_server_factory,
                    downloader=self._downloader,
                    executor_id=self._executor_id,
                    base_url=self._base_url,
                    config_path=self._config_path,
                    logger=self._logger,
                )
            except CustomerError as e:
                next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
                next_status_message = str(e)
            except Exception as e:
                next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
                # Don't swallow the platform error silently; it's the only
                # place where its details are available.
                self._logger.error(
                    "failed creating function executor", exc_info=e
                )

        # FE state lock is acquired again at this point.
        await self._function_executor_state.set_status(next_status, next_status_message)

        if next_status == FunctionExecutorStatus.IDLE:
            # Task controllers will notice that this FE is IDLE and start running on it one by one.
            self._function_executor_state.function_executor = function_executor
            # Health checker starts after FE creation and gets automatically stopped on FE destroy.
            self._function_executor_state.function_executor.health_checker().start(
                self._health_check_failed_callback
            )

    async def _destroy_function_executor(self) -> None:
        """Destroys the Function Executor if it exists.

        Caller holds the FE state lock.
        """
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.DESTROYING
        )
        # The FE can be None, e.g. when its startup failed and no FE object
        # was ever assigned to the state; there's nothing to destroy then.
        if self._function_executor_state.function_executor is not None:
            async with _UnlockedLockContextManager(self._function_executor_state.lock):
                await self._function_executor_state.function_executor.destroy()
        await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
        self._function_executor_state.function_executor = None

    async def _shutdown(self) -> None:
        """Shuts down the controller.

        Caller holds the FE state lock.
        Raises asyncio.CancelledError on return when called from reconciliation loop.
        """
        self._logger.info("shutting down function executor controller")
        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
        self._reconciliation_loop_task.cancel()
        await self._reconciliation_loop_task

    async def _health_check_failed_callback(self, result: HealthCheckResult):
        """Marks the FE UNHEALTHY when a health check fails while it's in use."""
        async with self._function_executor_state.lock:
            if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
                return

            if self._function_executor_state.status in (
                FunctionExecutorStatus.IDLE,
                FunctionExecutorStatus.RUNNING_TASK,
            ):
                # There can be false positive health check failures when we're creating
                # or destroying FEs so we're not interested in them.
                #
                # Server should react to this transition into unhealthy state and ask to
                # destroy this FE.
                await self._function_executor_state.set_status(
                    FunctionExecutorStatus.UNHEALTHY
                )
323
+
324
+
325
async def _create_function_executor(
    function_executor_description: FunctionExecutorDescription,
    function_executor_server_factory: FunctionExecutorServerFactory,
    downloader: Downloader,
    executor_id: str,
    base_url: str,
    config_path: str,
    logger: Any,
) -> FunctionExecutor:
    """Creates and initializes a Function Executor for the described function.

    Raises Exception in case of failure.
    Raises CustomerError if customer code failed during FE creation.
    """
    description = function_executor_description

    graph_so: SerializedObject = await downloader.download_graph(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        logger=logger,
    )

    server_config: FunctionExecutorServerConfiguration = (
        FunctionExecutorServerConfiguration(
            executor_id=executor_id,
            function_executor_id=description.id,
            namespace=description.namespace,
            secret_names=list(description.secret_names),
        )
    )
    if description.HasField("image_uri"):
        server_config.image_uri = description.image_uri

    init_request: InitializeRequest = InitializeRequest(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        function_name=description.function_name,
        graph=graph_so,
    )

    code_init_timeout_sec: Optional[float] = None
    if description.HasField("customer_code_timeout_ms"):
        # TODO: Add integration tests with FE customer code initialization timeout
        # when end-to-end implementation is done.
        code_init_timeout_sec = description.customer_code_timeout_ms / 1000.0

    fe: FunctionExecutor = FunctionExecutor(
        server_factory=function_executor_server_factory, logger=logger
    )
    try:
        # Raises CustomerError if initialization failed in customer code or customer code timed out.
        await fe.initialize(
            config=server_config,
            initialize_request=init_request,
            base_url=base_url,
            config_path=config_path,
            customer_code_timeout_sec=code_init_timeout_sec,
        )
        return fe
    except Exception:
        # Don't leak a half-initialized FE server process.
        await fe.destroy()
        raise
387
+
388
+
389
def _validate_function_executor_description(
    function_executor_description: FunctionExecutorDescription,
) -> None:
    """Validates the supplied FE description.

    Raises ValueError if the description is not valid.
    """
    validator = MessageValidator(function_executor_description)
    # image_uri and resource_limits are optional, secret_names can be empty,
    # so only the identity fields below are mandatory.
    for required_field_name in (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
    ):
        validator.required_field(required_field_name)
405
+
406
+
407
+ class _UnlockedLockContextManager:
408
+ """Unlocks its lock on enter to the scope and locks it back on exit."""
409
+
410
+ def __init__(
411
+ self,
412
+ lock: asyncio.Lock,
413
+ ):
414
+ self._lock: asyncio.Lock = lock
415
+
416
+ async def __aenter__(self):
417
+ self._lock.release()
418
+ return self
419
+
420
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
421
+ await self._lock.acquire()