indexify 0.4.10__tar.gz → 0.4.12__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (70)
  1. {indexify-0.4.10 → indexify-0.4.12}/PKG-INFO +2 -2
  2. {indexify-0.4.10 → indexify-0.4.12}/pyproject.toml +2 -2
  3. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/events.py +18 -16
  4. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/function_executor_controller.py +177 -121
  5. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py +25 -18
  6. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/run_task.py +14 -0
  7. indexify-0.4.12/src/indexify/executor/function_executor_controller/terminate_function_executor.py +38 -0
  8. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/state_reconciler.py +2 -7
  9. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api.proto +2 -4
  10. indexify-0.4.12/src/indexify/proto/executor_api_pb2.py +88 -0
  11. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api_pb2.pyi +9 -13
  12. indexify-0.4.10/src/indexify/executor/function_executor_controller/destroy_function_executor.py +0 -28
  13. indexify-0.4.10/src/indexify/proto/executor_api_pb2.py +0 -88
  14. {indexify-0.4.10 → indexify-0.4.12}/README.md +0 -0
  15. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/__init__.py +0 -0
  16. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/build_image.py +0 -0
  17. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/deploy.py +0 -0
  18. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/cli/executor.py +0 -0
  19. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/README.md +0 -0
  20. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/blob_store.py +0 -0
  21. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
  22. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
  23. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
  24. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/channel_manager.py +0 -0
  25. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/executor.py +0 -0
  26. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_allowlist.py +0 -0
  27. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/function_executor.py +0 -0
  28. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/health_checker.py +0 -0
  29. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  30. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  31. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  32. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  33. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  34. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  35. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  36. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  37. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  38. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/__init__.py +0 -0
  39. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/completed_task_metrics.py +0 -0
  40. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/create_function_executor.py +0 -0
  41. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/debug_event_loop.py +0 -0
  42. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/downloads.py +0 -0
  43. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -0
  44. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/loggers.py +0 -0
  45. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/message_validators.py +0 -0
  46. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -0
  47. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/downloads.py +0 -0
  48. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/run_task.py +0 -0
  49. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -0
  50. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/prepare_task.py +0 -0
  51. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/task_info.py +0 -0
  52. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/task_output.py +0 -0
  53. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/function_executor_controller/upload_task_output.py +0 -0
  54. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/host_resources.py +0 -0
  55. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/nvidia_gpu.py +0 -0
  56. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
  57. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/channel_manager.py +0 -0
  58. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/executor.py +0 -0
  59. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/state_reconciler.py +0 -0
  60. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/metrics/state_reporter.py +0 -0
  61. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/handler.py +0 -0
  62. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  63. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  64. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  65. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/metrics.py +0 -0
  66. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  67. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/server.py +0 -0
  68. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  69. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/executor/state_reporter.py +0 -0
  70. {indexify-0.4.10 → indexify-0.4.12}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.4.10
3
+ Version: 0.4.12
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -17,7 +17,7 @@ Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
17
17
  Requires-Dist: boto3 (>=1.37.30,<2.0.0)
18
18
  Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
19
19
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
20
- Requires-Dist: tensorlake (==0.2.7)
20
+ Requires-Dist: tensorlake (==0.2.8)
21
21
  Project-URL: Repository, https://github.com/tensorlakeai/indexify
22
22
  Description-Content-Type: text/markdown
23
23
 
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this packages are updated.
4
- version = "0.4.10"
4
+ version = "0.4.12"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -25,7 +25,7 @@ prometheus-client = "^0.21.1"
25
25
  psutil = "^7.0.0"
26
26
  # Adds function-executor binary, utils lib, sdk used in indexify-cli commands.
27
27
  # We need to specify the tensorlake version exactly because pip install doesn't respect poetry.lock files.
28
- tensorlake = "0.2.7"
28
+ tensorlake = "0.2.8"
29
29
  # Uncomment the next line to use local tensorlake package (only for development!)
30
30
  # tensorlake = { path = "../tensorlake", develop = true }
31
31
  # pydantic is provided by tensorlake
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import Optional
2
+ from typing import List, Optional
3
3
 
4
4
  from indexify.executor.function_executor.function_executor import (
5
5
  FunctionExecutor,
@@ -12,7 +12,7 @@ from .task_info import TaskInfo
12
12
 
13
13
  class EventType(Enum):
14
14
  FUNCTION_EXECUTOR_CREATED = 1
15
- FUNCTION_EXECUTOR_DESTROYED = 2
15
+ FUNCTION_EXECUTOR_TERMINATED = 2
16
16
  SHUTDOWN_INITIATED = 3
17
17
  TASK_PREPARATION_FINISHED = 4
18
18
  SCHEDULE_TASK_EXECUTION = 5
@@ -50,23 +50,32 @@ class FunctionExecutorCreated(BaseEvent):
50
50
  self.output: FunctionExecutorStartupOutput = output
51
51
 
52
52
 
53
- class FunctionExecutorDestroyed(BaseEvent):
53
+ class FunctionExecutorTerminated(BaseEvent):
54
54
  """
55
- Event indicating that Function Executor has been destroyed.
55
+ Event indicating that Function Executor has been terminated (destroyed).
56
56
  """
57
57
 
58
58
  def __init__(
59
- self, is_success: bool, termination_reason: FunctionExecutorTerminationReason
59
+ self,
60
+ is_success: bool,
61
+ fe_termination_reason: FunctionExecutorTerminationReason,
62
+ allocation_ids_caused_termination: List[str],
60
63
  ):
61
- super().__init__(EventType.FUNCTION_EXECUTOR_DESTROYED)
64
+ super().__init__(EventType.FUNCTION_EXECUTOR_TERMINATED)
62
65
  self.is_success: bool = is_success
63
- self.termination_reason: FunctionExecutorTerminationReason = termination_reason
66
+ self.fe_termination_reason: FunctionExecutorTerminationReason = (
67
+ fe_termination_reason
68
+ )
69
+ self.allocation_ids_caused_termination: List[str] = (
70
+ allocation_ids_caused_termination
71
+ )
64
72
 
65
73
  def __str__(self) -> str:
66
74
  return (
67
75
  f"Event(type={self.event_type.name}, "
68
76
  f"is_success={self.is_success}, "
69
- f"termination_reason={FunctionExecutorTerminationReason.Name(self.termination_reason)})"
77
+ f"fe_termination_reason={FunctionExecutorTerminationReason.Name(self.fe_termination_reason)}, "
78
+ f"allocation_ids_caused_termination={self.allocation_ids_caused_termination})"
70
79
  )
71
80
 
72
81
 
@@ -75,15 +84,8 @@ class ShutdownInitiated(BaseEvent):
75
84
  Event indicating that Function Executor shutdown has been initiated.
76
85
  """
77
86
 
78
- def __init__(self, termination_reason: FunctionExecutorTerminationReason):
87
+ def __init__(self):
79
88
  super().__init__(EventType.SHUTDOWN_INITIATED)
80
- self.termination_reason: FunctionExecutorTerminationReason = termination_reason
81
-
82
- def __str__(self) -> str:
83
- return (
84
- f"Event(type={self.event_type.name}, "
85
- f"termination_reason={FunctionExecutorTerminationReason.Name(self.termination_reason)})"
86
- )
87
89
 
88
90
 
89
91
  class TaskPreparationFinished(BaseEvent):
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  import time
3
3
  from collections.abc import Coroutine
4
+ from enum import Enum
4
5
  from pathlib import Path
5
6
  from typing import Any, Dict, List, Optional
6
7
 
@@ -28,12 +29,11 @@ from .debug_event_loop import (
28
29
  debug_print_events,
29
30
  debug_print_processing_event,
30
31
  )
31
- from .destroy_function_executor import destroy_function_executor
32
32
  from .events import (
33
33
  BaseEvent,
34
34
  EventType,
35
35
  FunctionExecutorCreated,
36
- FunctionExecutorDestroyed,
36
+ FunctionExecutorTerminated,
37
37
  ScheduleTaskExecution,
38
38
  ShutdownInitiated,
39
39
  TaskExecutionFinished,
@@ -43,12 +43,14 @@ from .events import (
43
43
  from .function_executor_startup_output import FunctionExecutorStartupOutput
44
44
  from .loggers import function_executor_logger, task_allocation_logger
45
45
  from .metrics.function_executor_controller import (
46
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
47
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
48
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
49
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
46
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
47
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING,
48
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP,
49
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED,
50
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING,
51
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
50
52
  metric_control_loop_handle_event_latency,
51
- metric_function_executors_with_status,
53
+ metric_function_executors_with_state,
52
54
  metric_runnable_tasks,
53
55
  metric_runnable_tasks_per_function_name,
54
56
  metric_schedule_task_latency,
@@ -58,9 +60,20 @@ from .prepare_task import prepare_task
58
60
  from .run_task import run_task_on_function_executor
59
61
  from .task_info import TaskInfo
60
62
  from .task_output import TaskOutput
63
+ from .terminate_function_executor import terminate_function_executor
61
64
  from .upload_task_output import upload_task_output
62
65
 
63
66
 
67
+ # Actual FE controller states, they are a bit different from statuses reported to the Server.
68
+ # All the valid state transitions are forward only (can skip multiple states in a row).
69
+ class _FE_CONTROLLER_STATE(Enum):
70
+ NOT_STARTED = 1
71
+ STARTING_UP = 2
72
+ RUNNING = 3
73
+ TERMINATING = 4
74
+ TERMINATED = 5
75
+
76
+
64
77
  class FunctionExecutorController:
65
78
  def __init__(
66
79
  self,
@@ -94,19 +107,17 @@ class FunctionExecutorController:
94
107
  self._logger: Any = function_executor_logger(
95
108
  function_executor_description, logger.bind(module=__name__)
96
109
  )
97
- # Mutable state. No lock needed as it's modified by async tasks running in
98
- # the same event loop.
110
+ self._destroy_lock: asyncio.Lock = asyncio.Lock()
111
+ # Mutable state. No lock needed as it's modified by async tasks running in the same event loop.
99
112
  self._fe: Optional[FunctionExecutor] = None
100
- self._fe_termination_reason: FunctionExecutorTerminationReason = (
101
- None # Optional
102
- )
103
- # FE Status reported to Server.
104
- self._status: FunctionExecutorStatus = (
105
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
106
- )
107
- metric_function_executors_with_status.labels(
108
- status=_to_fe_status_metric_label(self._status, self._logger)
113
+ self._internal_state = _FE_CONTROLLER_STATE.NOT_STARTED
114
+ metric_function_executors_with_state.labels(
115
+ state=_to_fe_state_metric_label(self._internal_state, self._logger)
109
116
  ).inc()
117
+ self._reported_state: FunctionExecutorState = FunctionExecutorState(
118
+ description=function_executor_description,
119
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN,
120
+ )
110
121
  # Ordered list of events to be processed by the control loop.
111
122
  self._events: List[BaseEvent] = []
112
123
  # Asyncio event used to notify the control loop that there are new events to process.
@@ -124,13 +135,6 @@ class FunctionExecutorController:
124
135
  def function_executor_id(self) -> str:
125
136
  return self._fe_description.id
126
137
 
127
- def status(self) -> FunctionExecutorStatus:
128
- """Returns the current status of the Function Executor.
129
-
130
- Not blocking.
131
- """
132
- return self._status
133
-
134
138
  def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
135
139
  """Adds a task to the Function Executor.
136
140
 
@@ -205,9 +209,10 @@ class FunctionExecutorController:
205
209
  """Starts up the Function Executor and prepares it to run tasks.
206
210
 
207
211
  Not blocking. Never raises exceptions."""
208
- if self._control_loop_aio_task is not None:
212
+ if self._internal_state != _FE_CONTROLLER_STATE.NOT_STARTED:
209
213
  self._logger.warning(
210
- "ignoring startup call as the Function Executor is already started"
214
+ "function executor state is not NOT_STARTED, ignoring startup call",
215
+ internal_state=self._internal_state.name,
211
216
  )
212
217
  return
213
218
 
@@ -215,7 +220,13 @@ class FunctionExecutorController:
215
220
  self._control_loop(),
216
221
  name="function executor control loop",
217
222
  )
218
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING)
223
+ self._update_internal_state(_FE_CONTROLLER_STATE.STARTING_UP)
224
+ self._update_reported_state(
225
+ FunctionExecutorState(
226
+ description=self._fe_description,
227
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING,
228
+ )
229
+ )
219
230
  next_aio = create_function_executor(
220
231
  function_executor_description=self._fe_description,
221
232
  function_executor_server_factory=self._fe_server_factory,
@@ -237,17 +248,13 @@ class FunctionExecutorController:
237
248
  ),
238
249
  )
239
250
 
240
- async def shutdown(
241
- self, termination_reason: FunctionExecutorTerminationReason
242
- ) -> None:
251
+ async def shutdown(self) -> None:
243
252
  """Shutsdown the Function Executor and frees all of its resources.
244
253
 
245
- All the tasks are reported as failed with FE Terminated failure code.
254
+ No task outcomes and outputs are getting reported to Server after this call.
246
255
  Doesn't raise any exceptions. Blocks until the shutdown is complete.
247
256
  """
248
- self._add_event(
249
- ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
250
- )
257
+ self._add_event(ShutdownInitiated(), source="shutdown")
251
258
  try:
252
259
  await self._control_loop_aio_task
253
260
  except asyncio.CancelledError:
@@ -259,51 +266,49 @@ class FunctionExecutorController:
259
266
  )
260
267
  self._logger.info("function executor controller shutdown finished")
261
268
 
262
- def _set_status(
269
+ def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
270
+ """Updates the internal state of the Function Executor Controller.
271
+
272
+ Not blocking. Never raises exceptions."""
273
+ old_state: _FE_CONTROLLER_STATE = self._internal_state
274
+ self._internal_state = new_state
275
+
276
+ self._logger.info(
277
+ "function executor internal state changed",
278
+ old_state=old_state.name,
279
+ new_state=new_state.name,
280
+ )
281
+
282
+ metric_function_executors_with_state.labels(
283
+ state=_to_fe_state_metric_label(old_state, self._logger)
284
+ ).dec()
285
+ metric_function_executors_with_state.labels(
286
+ state=_to_fe_state_metric_label(new_state, self._logger)
287
+ ).inc()
288
+
289
+ def _update_reported_state(
263
290
  self,
264
- status: FunctionExecutorStatus,
291
+ new_state: FunctionExecutorState,
265
292
  ) -> None:
266
- """Sets Function Executor status and reports it to the Server.
293
+ """Sets new Function Executor state and reports it to the Server.
267
294
 
268
295
  Not blocking. Never raises exceptions."""
269
- old_status: FunctionExecutorStatus = self._status
270
- new_status: FunctionExecutorStatus = status
271
- self._status: FunctionExecutorStatus = new_status
296
+ old_state: FunctionExecutorState = self._reported_state
297
+ self._reported_state = new_state
272
298
 
273
299
  self._logger.info(
274
- "function executor status changed",
275
- old_status=FunctionExecutorStatus.Name(old_status),
276
- new_status=FunctionExecutorStatus.Name(new_status),
300
+ "function executor grpc status changed",
301
+ old_status=FunctionExecutorStatus.Name(old_state.status),
302
+ new_status=FunctionExecutorStatus.Name(new_state.status),
277
303
  termination_reason=_termination_reason_to_short_name(
278
- self._fe_termination_reason
304
+ new_state.termination_reason
279
305
  ),
280
306
  )
281
- metric_function_executors_with_status.labels(
282
- status=_to_fe_status_metric_label(old_status, self._logger)
283
- ).dec()
284
- metric_function_executors_with_status.labels(
285
- status=_to_fe_status_metric_label(new_status, self._logger)
286
- ).inc()
287
307
 
288
- self._state_reporter.update_function_executor_state(self._current_state())
308
+ self._state_reporter.update_function_executor_state(new_state)
289
309
  # Report the status change to the Server asap to reduce latency in the system.
290
310
  self._state_reporter.schedule_state_report()
291
311
 
292
- def _current_state(self) -> FunctionExecutorState:
293
- """Returns the current state of the Function Executor.
294
-
295
- Not blocking. Never raises exceptions.
296
- """
297
- termination_reason: Optional[FunctionExecutorTerminationReason] = None
298
- if self._fe_termination_reason is not None:
299
- termination_reason = self._fe_termination_reason
300
-
301
- return FunctionExecutorState(
302
- description=self._fe_description,
303
- status=self._status,
304
- termination_reason=termination_reason,
305
- )
306
-
307
312
  async def _control_loop(self) -> None:
308
313
  """Runs control loop that coordinates all the work done by the Function Executor.
309
314
 
@@ -341,8 +346,8 @@ class FunctionExecutorController:
341
346
  """
342
347
  if event.event_type == EventType.FUNCTION_EXECUTOR_CREATED:
343
348
  return self._handle_event_function_executor_created(event)
344
- elif event.event_type == EventType.FUNCTION_EXECUTOR_DESTROYED:
345
- return self._handle_event_function_executor_destroyed(event)
349
+ elif event.event_type == EventType.FUNCTION_EXECUTOR_TERMINATED:
350
+ return self._handle_event_function_executor_terminated(event)
346
351
  elif event.event_type == EventType.TASK_PREPARATION_FINISHED:
347
352
  return self._handle_event_task_preparation_finished(event)
348
353
  elif event.event_type == EventType.SCHEDULE_TASK_EXECUTION:
@@ -454,13 +459,25 @@ class FunctionExecutorController:
454
459
  self._state_reporter.schedule_state_report()
455
460
 
456
461
  if event.function_executor is None:
457
- self._destroy_function_executor_before_termination(
458
- event.output.termination_reason
462
+ # Server needs to increment attempts counter for all the tasks that were pending while FE was starting up.
463
+ # This prevents infinite retries if FEs consistently fail to start up.
464
+ self._start_termination(
465
+ fe_termination_reason=event.output.termination_reason,
466
+ allocation_ids_caused_termination=[
467
+ task_info.allocation.allocation_id
468
+ for task_info in self._tasks.values()
469
+ ],
459
470
  )
460
471
  return
461
472
 
462
473
  self._fe = event.function_executor
463
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING)
474
+ self._update_internal_state(_FE_CONTROLLER_STATE.RUNNING)
475
+ self._update_reported_state(
476
+ FunctionExecutorState(
477
+ description=self._fe_description,
478
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
479
+ )
480
+ )
464
481
  # Health checker starts after FE creation and gets automatically stopped on FE destroy.
465
482
  self._fe.health_checker().start(self._health_check_failed_callback)
466
483
  self._add_event(
@@ -468,20 +485,30 @@ class FunctionExecutorController:
468
485
  source="_handle_event_function_executor_created",
469
486
  )
470
487
 
471
- def _handle_event_function_executor_destroyed(
472
- self, event: FunctionExecutorDestroyed
488
+ def _handle_event_function_executor_terminated(
489
+ self, event: FunctionExecutorTerminated
473
490
  ) -> None:
474
- """Handles the Function Executor destroy finished event.
491
+ """Handles the Function Executor terminated event.
475
492
 
476
493
  Doesn't raise any exceptions. Doesn't block.
477
494
  """
478
495
  if not event.is_success:
479
496
  self._logger.error(
480
- "Function Executor destroy failed unexpectedly, this should never happen",
497
+ "Function Executor termination failed unexpectedly, this should never happen",
498
+ )
499
+
500
+ self._fe = None
501
+ # Set reported status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
502
+ self._update_reported_state(
503
+ FunctionExecutorState(
504
+ description=self._fe_description,
505
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
506
+ termination_reason=event.fe_termination_reason,
507
+ allocation_ids_caused_termination=event.allocation_ids_caused_termination,
481
508
  )
482
- # Set the status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
483
- self._fe_termination_reason = event.termination_reason
484
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED)
509
+ )
510
+ self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATED)
511
+
485
512
  # Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
486
513
  self._add_event(
487
514
  ScheduleTaskExecution(),
@@ -493,8 +520,14 @@ class FunctionExecutorController:
493
520
  "Function Executor health check failed, terminating Function Executor",
494
521
  reason=result.reason,
495
522
  )
496
- self._destroy_function_executor_before_termination(
497
- termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
523
+
524
+ self._start_termination(
525
+ fe_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY,
526
+ allocation_ids_caused_termination=(
527
+ []
528
+ if self._running_task is None
529
+ else [self._running_task.allocation.allocation_id]
530
+ ),
498
531
  )
499
532
 
500
533
  def _handle_event_task_preparation_finished(
@@ -532,14 +565,15 @@ class FunctionExecutorController:
532
565
  if len(self._runnable_tasks) == 0:
533
566
  return
534
567
 
535
- if self._status not in [
536
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
537
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
568
+ if self._internal_state not in [
569
+ _FE_CONTROLLER_STATE.RUNNING,
570
+ _FE_CONTROLLER_STATE.TERMINATING,
571
+ _FE_CONTROLLER_STATE.TERMINATED,
538
572
  ]:
539
- return # Can't progress pending task with the current status.
573
+ return # Can't progress runnable tasks in the current state.
540
574
 
541
575
  if (
542
- self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING
576
+ self._internal_state == _FE_CONTROLLER_STATE.RUNNING
543
577
  and self._running_task is not None
544
578
  ):
545
579
  return
@@ -555,12 +589,15 @@ class FunctionExecutorController:
555
589
  if task_info.is_cancelled:
556
590
  task_info.output = TaskOutput.task_cancelled(task_info.allocation)
557
591
  self._start_task_output_upload(task_info)
558
- elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
592
+ elif self._internal_state in [
593
+ _FE_CONTROLLER_STATE.TERMINATING,
594
+ _FE_CONTROLLER_STATE.TERMINATED,
595
+ ]:
559
596
  task_info.output = TaskOutput.function_executor_terminated(
560
597
  task_info.allocation
561
598
  )
562
599
  self._start_task_output_upload(task_info)
563
- elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
600
+ elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
564
601
  self._running_task = task_info
565
602
  next_aio = run_task_on_function_executor(
566
603
  task_info=task_info,
@@ -603,8 +640,11 @@ class FunctionExecutorController:
603
640
  ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
604
641
  )
605
642
  else:
606
- self._destroy_function_executor_before_termination(
607
- termination_reason=event.function_executor_termination_reason
643
+ self._start_termination(
644
+ fe_termination_reason=event.function_executor_termination_reason,
645
+ allocation_ids_caused_termination=[
646
+ event.task_info.allocation.allocation_id
647
+ ],
608
648
  )
609
649
 
610
650
  # Ignore is_cancelled because cancelling a task still involves uploading its output.
@@ -660,23 +700,37 @@ class FunctionExecutorController:
660
700
  )
661
701
  self._state_reporter.schedule_state_report()
662
702
 
663
- def _destroy_function_executor_before_termination(
664
- self, termination_reason: FunctionExecutorTerminationReason
703
+ def _start_termination(
704
+ self,
705
+ fe_termination_reason: FunctionExecutorTerminationReason,
706
+ allocation_ids_caused_termination: List[str],
665
707
  ) -> None:
666
- """Destroys the Function Executor and frees all its resources to prepare for transitioning to the TERMINATED state.
708
+ """Starts termination of the Function Executor if it's not started yet.
667
709
 
668
710
  Doesn't raise any exceptions. Doesn't block.
669
711
  """
670
- next_aio = destroy_function_executor(
712
+ if self._internal_state in [
713
+ _FE_CONTROLLER_STATE.TERMINATING,
714
+ _FE_CONTROLLER_STATE.TERMINATED,
715
+ ]:
716
+ # _start_termination() can be called multiple times, e.g. by each failed task alloc
717
+ # when the FE is unhealthy. Dedup the calls to keep state machine consistent.
718
+ return
719
+
720
+ self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATING)
721
+ next_aio = terminate_function_executor(
671
722
  function_executor=self._fe,
672
- termination_reason=termination_reason,
723
+ lock=self._destroy_lock,
724
+ fe_termination_reason=fe_termination_reason,
725
+ allocation_ids_caused_termination=allocation_ids_caused_termination,
673
726
  logger=self._logger,
674
727
  )
675
- self._fe = None
676
728
  self._spawn_aio_for_fe(
677
729
  aio=next_aio,
678
- on_exception=FunctionExecutorDestroyed(
679
- is_success=False, termination_reason=termination_reason
730
+ on_exception=FunctionExecutorTerminated(
731
+ is_success=False,
732
+ fe_termination_reason=fe_termination_reason,
733
+ allocation_ids_caused_termination=allocation_ids_caused_termination,
680
734
  ),
681
735
  )
682
736
 
@@ -716,16 +770,18 @@ class FunctionExecutorController:
716
770
  # BaseException includes asyncio.CancelledError which is always raised here.
717
771
  pass
718
772
 
719
- if self._status != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
720
- self._handle_event_function_executor_destroyed(
721
- await destroy_function_executor(
722
- function_executor=self._fe,
723
- termination_reason=event.termination_reason,
724
- logger=self._logger,
773
+ # Makes sure we don't run fe destroy concurrently with an event loop task.
774
+ # FE destroy uses asyncio.to_thread() calls so it doesn't get cancelled with all the tasks above.
775
+ async with self._destroy_lock:
776
+ if self._fe is not None:
777
+ self._logger.info(
778
+ "destroying function executor",
725
779
  )
726
- )
727
- metric_function_executors_with_status.labels(
728
- status=_to_fe_status_metric_label(self._status, self._logger)
780
+ await self._fe.destroy()
781
+
782
+ # Cleanup the metric from this FE.
783
+ metric_function_executors_with_state.labels(
784
+ state=_to_fe_state_metric_label(self._internal_state, self._logger)
729
785
  ).dec()
730
786
 
731
787
  self._state_reporter.remove_function_executor_state(self.function_executor_id())
@@ -735,21 +791,23 @@ class FunctionExecutorController:
735
791
  debug_print_events(events=self._events, logger=self._logger)
736
792
 
737
793
 
738
- def _to_fe_status_metric_label(status: FunctionExecutorStatus, logger: Any) -> str:
739
- if status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
740
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
741
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING:
742
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
743
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
744
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
745
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
746
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
794
+ def _to_fe_state_metric_label(state: _FE_CONTROLLER_STATE, logger: Any) -> str:
795
+ if state == _FE_CONTROLLER_STATE.NOT_STARTED:
796
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
797
+ elif state == _FE_CONTROLLER_STATE.STARTING_UP:
798
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
799
+ elif state == _FE_CONTROLLER_STATE.RUNNING:
800
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
801
+ elif state == _FE_CONTROLLER_STATE.TERMINATING:
802
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
803
+ elif state == _FE_CONTROLLER_STATE.TERMINATED:
804
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
747
805
  else:
748
806
  logger.error(
749
- "unexpected Function Executor status",
750
- status=FunctionExecutorStatus.Name(status),
807
+ "unexpected Function Executor internal state",
808
+ state=state.name,
751
809
  )
752
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
810
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
753
811
 
754
812
 
755
813
  _termination_reason_to_short_name_map = {
@@ -757,8 +815,6 @@ _termination_reason_to_short_name_map = {
757
815
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
758
816
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
759
817
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
760
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
761
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
762
818
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
763
819
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
764
820
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
@@ -34,27 +34,34 @@ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
34
34
  )
35
35
  )
36
36
 
37
- metric_function_executors_with_status: prometheus_client.Gauge = (
38
- prometheus_client.Gauge(
39
- "function_executors_with_status",
40
- "Number of Function Executors with a particular status",
41
- ["status"],
42
- )
37
+ metric_function_executors_with_state: prometheus_client.Gauge = prometheus_client.Gauge(
38
+ "function_executors_with_state",
39
+ "Number of Function Executors with a particular internal state",
40
+ ["state"],
43
41
  )
44
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
45
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
46
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
47
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"
42
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN = "unknown"
43
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED = "not_started"
44
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP = "starting_up"
45
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING = "running"
46
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING = "terminating"
47
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED = "terminated"
48
+
48
49
 
49
- metric_function_executors_with_status.labels(
50
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
50
+ metric_function_executors_with_state.labels(
51
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
52
+ )
53
+ metric_function_executors_with_state.labels(
54
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
55
+ )
56
+ metric_function_executors_with_state.labels(
57
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
51
58
  )
52
- metric_function_executors_with_status.labels(
53
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
59
+ metric_function_executors_with_state.labels(
60
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
54
61
  )
55
- metric_function_executors_with_status.labels(
56
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
62
+ metric_function_executors_with_state.labels(
63
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
57
64
  )
58
- metric_function_executors_with_status.labels(
59
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
65
+ metric_function_executors_with_state.labels(
66
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
60
67
  )