indexify 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. indexify/cli/cli.py +20 -91
  2. indexify/executor/api_objects.py +2 -0
  3. indexify/executor/executor.py +75 -84
  4. indexify/executor/function_executor/function_executor.py +5 -2
  5. indexify/executor/function_executor/function_executor_state.py +43 -43
  6. indexify/executor/function_executor/function_executor_states_container.py +10 -4
  7. indexify/executor/function_executor/function_executor_status.py +91 -0
  8. indexify/executor/function_executor/health_checker.py +37 -13
  9. indexify/executor/function_executor/metrics/function_executor.py +1 -1
  10. indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
  11. indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
  12. indexify/executor/function_executor/single_task_runner.py +100 -37
  13. indexify/executor/grpc/channel_creator.py +53 -0
  14. indexify/executor/grpc/metrics/channel_creator.py +18 -0
  15. indexify/executor/grpc/metrics/state_reporter.py +17 -0
  16. indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
  17. indexify/executor/grpc/state_reporter.py +199 -0
  18. indexify/executor/metrics/task_runner.py +7 -0
  19. indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
  20. indexify/executor/task_runner.py +34 -6
  21. indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
  22. indexify/proto/task_scheduler_pb2.py +64 -0
  23. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
  24. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
  25. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/METADATA +1 -1
  26. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/RECORD +28 -24
  27. indexify/executor/state_reporter.py +0 -127
  28. indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
  29. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/WHEEL +0 -0
  30. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/entry_points.txt +0 -0
@@ -7,29 +7,29 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
  SerializedObject,
8
8
  )
9
9
 
10
- from indexify.task_scheduler.proto.task_scheduler_pb2 import (
10
+ from indexify.proto.task_scheduler_pb2 import (
11
11
  DesiredExecutorState,
12
12
  FunctionExecutorDescription,
13
13
  FunctionExecutorStatus,
14
14
  GetDesiredExecutorStatesRequest,
15
15
  )
16
- from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
16
+ from indexify.proto.task_scheduler_pb2_grpc import (
17
17
  TaskSchedulerServiceStub,
18
18
  )
19
19
 
20
- from .downloader import Downloader
21
- from .function_executor.function_executor import CustomerError, FunctionExecutor
22
- from .function_executor.function_executor_state import FunctionExecutorState
23
- from .function_executor.function_executor_states_container import (
20
+ from ..downloader import Downloader
21
+ from ..function_executor.function_executor import CustomerError, FunctionExecutor
22
+ from ..function_executor.function_executor_state import FunctionExecutorState
23
+ from ..function_executor.function_executor_states_container import (
24
24
  FunctionExecutorStatesContainer,
25
25
  )
26
- from .function_executor.server.function_executor_server_factory import (
26
+ from ..function_executor.server.function_executor_server_factory import (
27
27
  FunctionExecutorServerConfiguration,
28
28
  FunctionExecutorServerFactory,
29
29
  )
30
- from .function_executor.task_input import TaskInput
31
- from .function_executor.task_output import TaskOutput
32
- from .metrics.executor import (
30
+ from ..function_executor.task_input import TaskInput
31
+ from ..function_executor.task_output import TaskOutput
32
+ from ..metrics.executor import (
33
33
  METRIC_TASKS_COMPLETED_OUTCOME_ALL,
34
34
  METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
35
35
  METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
@@ -42,7 +42,10 @@ from .metrics.executor import (
42
42
  metric_tasks_fetched,
43
43
  metric_tasks_reporting_outcome,
44
44
  )
45
- from .task_reporter import TaskReporter
45
+ from ..task_reporter import TaskReporter
46
+ from .channel_creator import ChannelCreator
47
+
48
+ _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
46
49
 
47
50
 
48
51
  class ExecutorStateReconciler:
@@ -55,11 +58,13 @@ class ExecutorStateReconciler:
55
58
  config_path: Optional[str],
56
59
  downloader: Downloader,
57
60
  task_reporter: TaskReporter,
58
- server_channel: grpc.aio.Channel,
61
+ channel_creator: ChannelCreator,
59
62
  logger: Any,
60
63
  ):
61
64
  self._executor_id: str = executor_id
62
- self._factory: FunctionExecutorServerFactory = function_executor_server_factory
65
+ self._function_executor_server_factory: FunctionExecutorServerFactory = (
66
+ function_executor_server_factory
67
+ )
63
68
  self._base_url: str = base_url
64
69
  self._config_path: Optional[str] = config_path
65
70
  self._downloader: Downloader = downloader
@@ -67,39 +72,60 @@ class ExecutorStateReconciler:
67
72
  self._function_executor_states: FunctionExecutorStatesContainer = (
68
73
  function_executor_states
69
74
  )
70
- self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
75
+ self._channel_creator = channel_creator
71
76
  self._logger: Any = logger.bind(module=__name__)
72
77
  self._is_shutdown: bool = False
73
- self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
74
78
  self._server_last_clock: Optional[int] = None
75
79
 
76
80
  async def run(self):
77
- desired_states: AsyncGenerator[DesiredExecutorState, None] = (
78
- self._stub.get_desired_executor_states(
79
- GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
80
- )
81
- )
81
+ """Runs the state reconciler.
82
+
83
+ Never raises any exceptions.
84
+ """
85
+ while not self._is_shutdown:
86
+ async with await self._channel_creator.create() as server_channel:
87
+ server_channel: grpc.aio.Channel
88
+ stub = TaskSchedulerServiceStub(server_channel)
89
+ while not self._is_shutdown:
90
+ try:
91
+ # TODO: Report state once before starting the stream.
92
+ desired_states_stream: AsyncGenerator[
93
+ DesiredExecutorState, None
94
+ ] = stub.get_desired_executor_states(
95
+ GetDesiredExecutorStatesRequest(
96
+ executor_id=self._executor_id
97
+ )
98
+ )
99
+ await self._process_desired_states_stream(desired_states_stream)
100
+ except Exception as e:
101
+ self._logger.error(
102
+ f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
103
+ exc_info=e,
104
+ )
105
+ await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
106
+ break
107
+
108
+ self._logger.info("State reconciler shutdown.")
109
+
110
+ async def _process_desired_states_stream(
111
+ self, desired_states: AsyncGenerator[DesiredExecutorState, None]
112
+ ):
82
113
  async for new_state in desired_states:
83
114
  if self._is_shutdown:
84
115
  return
116
+
85
117
  new_state: DesiredExecutorState
86
118
  if self._server_last_clock is not None:
87
119
  if self._server_last_clock >= new_state.clock:
88
120
  continue # Duplicate or outdated state message sent by Server.
89
121
 
90
122
  self._server_last_clock = new_state.clock
91
- asyncio.create_task(self._reconcile_state(new_state))
123
+ await self._reconcile_state(new_state)
92
124
 
93
125
  async def _reconcile_state(self, new_state: DesiredExecutorState):
94
- if self._is_shutdown:
95
- return
96
-
97
- # Simple non concurrent implementation for now for the PoC.
98
- # Obtain this lock to force only a single coroutine doing the reconciliation.
99
- async with self._reconciliation_lock:
100
- await self._reconcile_function_executors(new_state)
101
- # TODO
102
- # await self._reconcile_task_allocations(new_state)
126
+ await self._reconcile_function_executors(new_state)
127
+ # TODO
128
+ # await self._reconcile_task_allocations(new_state)
103
129
 
104
130
  async def shutdown(self):
105
131
  """Shuts down the state reconciler.
@@ -121,6 +147,7 @@ class ExecutorStateReconciler:
121
147
  graph_name=desired_function_executor.graph_name,
122
148
  graph_version=desired_function_executor.graph_version,
123
149
  function_name=desired_function_executor.function_name,
150
+ image_uri=desired_function_executor.image_uri,
124
151
  )
125
152
  )
126
153
 
@@ -203,13 +230,15 @@ class ExecutorStateReconciler:
203
230
  logger=logger,
204
231
  )
205
232
  function_executor: FunctionExecutor = FunctionExecutor(
206
- server_factory=self._factory, logger=logger
233
+ server_factory=self._function_executor_server_factory, logger=logger
207
234
  )
208
235
  config: FunctionExecutorServerConfiguration = (
209
236
  FunctionExecutorServerConfiguration(
210
237
  executor_id=self._executor_id,
211
238
  function_executor_id=description.id,
239
+ namespace=description.namespace,
212
240
  image_uri=description.image_uri,
241
+ secret_names=list(description.secret_names),
213
242
  )
214
243
  )
215
244
  initialize_request: InitializeRequest = InitializeRequest(
@@ -0,0 +1,199 @@
1
+ import asyncio
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import grpc
5
+
6
+ from indexify.proto.task_scheduler_pb2 import (
7
+ AllowedFunction,
8
+ ExecutorState,
9
+ ExecutorStatus,
10
+ FunctionExecutorDescription,
11
+ )
12
+ from indexify.proto.task_scheduler_pb2 import (
13
+ FunctionExecutorState as FunctionExecutorStateProto,
14
+ )
15
+ from indexify.proto.task_scheduler_pb2 import (
16
+ FunctionExecutorStatus as FunctionExecutorStatusProto,
17
+ )
18
+ from indexify.proto.task_scheduler_pb2 import (
19
+ GPUModel,
20
+ GPUResources,
21
+ HostResources,
22
+ ReportExecutorStateRequest,
23
+ )
24
+ from indexify.proto.task_scheduler_pb2_grpc import (
25
+ TaskSchedulerServiceStub,
26
+ )
27
+
28
+ from ..api_objects import FunctionURI
29
+ from ..function_executor.function_executor_state import FunctionExecutorState
30
+ from ..function_executor.function_executor_states_container import (
31
+ FunctionExecutorStatesContainer,
32
+ )
33
+ from ..function_executor.function_executor_status import FunctionExecutorStatus
34
+ from .channel_creator import ChannelCreator
35
+ from .metrics.state_reporter import (
36
+ metric_state_report_errors,
37
+ metric_state_report_latency,
38
+ metric_state_report_rpcs,
39
+ )
40
+
41
+ _REPORTING_INTERVAL_SEC = 5
42
+ _REPORT_RPC_TIMEOUT_SEC = 5
43
+ _REPORT_BACKOFF_ON_ERROR_SEC = 5
44
+
45
+
46
+ class ExecutorStateReporter:
47
+ def __init__(
48
+ self,
49
+ executor_id: str,
50
+ development_mode: bool,
51
+ function_allowlist: Optional[List[FunctionURI]],
52
+ function_executor_states: FunctionExecutorStatesContainer,
53
+ channel_creator: ChannelCreator,
54
+ logger: Any,
55
+ ):
56
+ self._executor_id: str = executor_id
57
+ self._development_mode: bool = development_mode
58
+ self._function_executor_states: FunctionExecutorStatesContainer = (
59
+ function_executor_states
60
+ )
61
+ self._channel_creator = channel_creator
62
+ self._logger: Any = logger.bind(module=__name__)
63
+ self._is_shutdown: bool = False
64
+ self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
65
+ self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
66
+ function_allowlist
67
+ )
68
+
69
+ def update_executor_status(self, value: ExecutorStatus):
70
+ self._executor_status = value
71
+
72
+ async def run(self):
73
+ """Runs the state reporter.
74
+
75
+ Never raises any exceptions.
76
+ """
77
+ while not self._is_shutdown:
78
+ async with await self._channel_creator.create() as server_channel:
79
+ server_channel: grpc.aio.Channel
80
+ stub = TaskSchedulerServiceStub(server_channel)
81
+ while not self._is_shutdown:
82
+ try:
83
+ await self._report_state(stub)
84
+ await asyncio.sleep(_REPORTING_INTERVAL_SEC)
85
+ except Exception as e:
86
+ self._logger.error(
87
+ f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
88
+ exc_info=e,
89
+ )
90
+ await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
91
+ break
92
+
93
+ self._logger.info("State reporter shutdown")
94
+
95
+ async def _report_state(self, stub: TaskSchedulerServiceStub):
96
+ with (
97
+ metric_state_report_errors.count_exceptions(),
98
+ metric_state_report_latency.time(),
99
+ ):
100
+ metric_state_report_rpcs.inc()
101
+ state = ExecutorState(
102
+ executor_id=self._executor_id,
103
+ development_mode=self._development_mode,
104
+ executor_status=self._executor_status,
105
+ free_resources=await self._fetch_free_host_resources(),
106
+ allowed_functions=self._allowed_functions,
107
+ function_executor_states=await self._fetch_function_executor_states(),
108
+ )
109
+
110
+ await stub.report_executor_state(
111
+ ReportExecutorStateRequest(executor_state=state),
112
+ timeout=_REPORT_RPC_TIMEOUT_SEC,
113
+ )
114
+
115
+ async def _fetch_free_host_resources(self) -> HostResources:
116
+ # TODO: Implement host resource metrics reporting.
117
+ return HostResources(
118
+ cpu_count=0,
119
+ memory_bytes=0,
120
+ disk_bytes=0,
121
+ gpu=GPUResources(
122
+ count=0,
123
+ model=GPUModel.GPU_MODEL_UNKNOWN,
124
+ ),
125
+ )
126
+
127
+ async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
128
+ states = []
129
+
130
+ async for function_executor_state in self._function_executor_states:
131
+ function_executor_state: FunctionExecutorState
132
+ states.append(
133
+ FunctionExecutorStateProto(
134
+ description=FunctionExecutorDescription(
135
+ id=function_executor_state.id,
136
+ namespace=function_executor_state.namespace,
137
+ graph_name=function_executor_state.graph_name,
138
+ graph_version=function_executor_state.graph_version,
139
+ function_name=function_executor_state.function_name,
140
+ ),
141
+ status=_to_grpc_function_executor_status(
142
+ function_executor_state.status, self._logger
143
+ ),
144
+ )
145
+ )
146
+
147
+ return states
148
+
149
+ async def shutdown(self):
150
+ """Shuts down the state reporter.
151
+
152
+ Never raises any exceptions.
153
+ """
154
+ self._is_shutdown = True
155
+
156
+
157
+ def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
158
+ if function_allowlist is None:
159
+ return []
160
+
161
+ allowed_functions: List[AllowedFunction] = []
162
+ for function_uri in function_allowlist:
163
+ function_uri: FunctionURI
164
+ allowed_function = AllowedFunction(
165
+ namespace=function_uri.namespace,
166
+ graph_name=function_uri.compute_graph,
167
+ function_name=function_uri.compute_fn,
168
+ )
169
+ if function_uri.version is not None:
170
+ allowed_function.graph_version = function_uri.version
171
+ allowed_functions.append(allowed_function)
172
+
173
+ return allowed_functions
174
+
175
+
176
+ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
177
+ FunctionExecutorStatus.STARTING_UP: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
178
+ FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR,
179
+ FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR,
180
+ FunctionExecutorStatus.IDLE: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
181
+ FunctionExecutorStatus.RUNNING_TASK: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
182
+ FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
183
+ FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
184
+ FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
185
+ FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
186
+ }
187
+
188
+
189
+ def _to_grpc_function_executor_status(
190
+ status: FunctionExecutorStatus, logger: Any
191
+ ) -> FunctionExecutorStatusProto:
192
+ result: FunctionExecutorStatusProto = _STATUS_MAPPING.get(
193
+ status, FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN
194
+ )
195
+
196
+ if result == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
197
+ logger.error("Unexpected Function Executor status", status=status)
198
+
199
+ return result
@@ -23,6 +23,13 @@ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gaug
23
23
  "tasks_blocked_by_policy",
24
24
  "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
25
25
  )
26
+ metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
27
+ prometheus_client.Gauge(
28
+ "tasks_blocked_by_policy_per_function_name",
29
+ "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
30
+ ["function_name"],
31
+ )
32
+ )
26
33
 
27
34
  # Metrics for the stage when task is running.
28
35
  metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
@@ -3,6 +3,7 @@ from typing import Optional
3
3
  from ...function_executor.function_executor_states_container import (
4
4
  FunctionExecutorStatesContainer,
5
5
  )
6
+ from ...function_executor.function_executor_status import FunctionExecutorStatus
6
7
  from .health_checker import HealthChecker, HealthCheckResult
7
8
 
8
9
  HEALTH_CHECKER_NAME = "GenericHealthChecker"
@@ -16,6 +17,7 @@ class GenericHealthChecker(HealthChecker):
16
17
 
17
18
  def __init__(self):
18
19
  self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
20
+ self._function_executor_health_check_ever_failed = False
19
21
 
20
22
  def set_function_executor_states_container(
21
23
  self, states: FunctionExecutorStatesContainer
@@ -42,17 +44,30 @@ class GenericHealthChecker(HealthChecker):
42
44
  # * So we fail whole Executor health check if a Function Executor health check ever failed to hint the users
43
45
  # that we probably need to recreate the Executor machine/VM/container (unless there's a bug in Function
44
46
  # code that user can investigate themself).
47
+ await self._check_function_executors()
48
+ if self._function_executor_health_check_ever_failed:
49
+ return HealthCheckResult(
50
+ is_success=False,
51
+ status_message="A Function Executor health check failed",
52
+ checker_name=HEALTH_CHECKER_NAME,
53
+ )
54
+ else:
55
+ return HealthCheckResult(
56
+ is_success=True,
57
+ status_message="All Function Executors pass health checks",
58
+ checker_name=HEALTH_CHECKER_NAME,
59
+ )
60
+
61
+ async def _check_function_executors(self):
62
+ if self._function_executor_health_check_ever_failed:
63
+ return
64
+
45
65
  async for state in self._function_executor_states:
46
66
  # No need to async lock the state to read a single value.
47
- if state.health_check_failed:
48
- return HealthCheckResult(
49
- is_success=False,
50
- status_message="A Function Executor health check failed",
51
- checker_name=HEALTH_CHECKER_NAME,
52
- )
53
-
54
- return HealthCheckResult(
55
- is_success=True,
56
- status_message="All Function Executors pass health checks",
57
- checker_name=HEALTH_CHECKER_NAME,
58
- )
67
+ if state.status in [
68
+ FunctionExecutorStatus.UNHEALTHY,
69
+ FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
70
+ FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
71
+ ]:
72
+ self._function_executor_health_check_ever_failed = True
73
+ return
@@ -1,7 +1,10 @@
1
1
  from typing import Any, Optional
2
2
 
3
3
  from .api_objects import Task
4
- from .function_executor.function_executor_state import FunctionExecutorState
4
+ from .function_executor.function_executor_state import (
5
+ FunctionExecutorState,
6
+ FunctionExecutorStatus,
7
+ )
5
8
  from .function_executor.function_executor_states_container import (
6
9
  FunctionExecutorStatesContainer,
7
10
  )
@@ -19,6 +22,7 @@ from .metrics.task_runner import (
19
22
  metric_task_run_platform_errors,
20
23
  metric_task_runs,
21
24
  metric_tasks_blocked_by_policy,
25
+ metric_tasks_blocked_by_policy_per_function_name,
22
26
  metric_tasks_running,
23
27
  )
24
28
 
@@ -52,6 +56,9 @@ class TaskRunner:
52
56
  with (
53
57
  metric_task_policy_errors.count_exceptions(),
54
58
  metric_tasks_blocked_by_policy.track_inprogress(),
59
+ metric_tasks_blocked_by_policy_per_function_name.labels(
60
+ function_name=task_input.task.compute_fn
61
+ ).track_inprogress(),
55
62
  metric_task_policy_latency.time(),
56
63
  ):
57
64
  metric_task_policy_runs.inc()
@@ -109,6 +116,10 @@ class TaskRunner:
109
116
  raise
110
117
 
111
118
  async def _run_task_policy(self, state: FunctionExecutorState, task: Task) -> None:
119
+ """Runs the task policy until the task can run on the Function Executor.
120
+
121
+ On successful return the Function Executor status is either IDLE or DESTROYED.
122
+ """
112
123
  # Current policy for running tasks:
113
124
  # - There can only be a single Function Executor per function regardless of function versions.
114
125
  # -- If a Function Executor already exists for a different function version then wait until
@@ -116,13 +127,30 @@ class TaskRunner:
116
127
  # -- This prevents failed tasks for different versions of the same function continuously
117
128
  # destroying each other's Function Executors.
118
129
  # - Each Function Executor runs at most 1 task concurrently.
119
- await state.wait_running_tasks_less(1)
130
+ await state.wait_status(
131
+ [
132
+ FunctionExecutorStatus.DESTROYED,
133
+ FunctionExecutorStatus.IDLE,
134
+ FunctionExecutorStatus.UNHEALTHY,
135
+ FunctionExecutorStatus.SHUTDOWN,
136
+ ]
137
+ )
138
+ # We only shutdown the Function Executor on full Executor shutdown so it's fine to raise error here.
139
+ if state.status == FunctionExecutorStatus.SHUTDOWN:
140
+ raise Exception("Function Executor state is shutting down")
120
141
 
121
- if state.graph_version != task.graph_version:
142
+ if state.status == FunctionExecutorStatus.UNHEALTHY:
122
143
  await state.destroy_function_executor()
123
- state.graph_version = task.graph_version
124
- # At this point the state belongs to the version of the function from the task
125
- # and there are no running tasks in the Function Executor.
144
+
145
+ if state.graph_version == task.graph_version:
146
+ return # All good, we can run on this Function Executor.
147
+
148
+ if state.status in [FunctionExecutorStatus.IDLE]:
149
+ await state.destroy_function_executor()
150
+
151
+ state.graph_version = task.graph_version
152
+ # At this point the state belongs to the version of the function from the task
153
+ # and there are no running tasks in the Function Executor.
126
154
 
127
155
  async def _run_task(
128
156
  self, state: FunctionExecutorState, task_input: TaskInput, logger: Any
@@ -26,7 +26,7 @@ message GPUResources {
26
26
  optional GPUModel model = 2;
27
27
  }
28
28
 
29
- // Free host resources available at the Executor.
29
+ // Resources that we're currently tracking and limiting on Executor.
30
30
  message HostResources {
31
31
  optional uint32 cpu_count = 1;
32
32
  optional uint64 memory_bytes = 2;
@@ -45,14 +45,14 @@ message AllowedFunction {
45
45
 
46
46
  enum FunctionExecutorStatus {
47
47
  FUNCTION_EXECUTOR_STATUS_UNKNOWN = 0;
48
- FUNCTION_EXECUTOR_STATUS_STOPPED = 1;
49
- FUNCTION_EXECUTOR_STATUS_STARTING_UP = 2;
50
- FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR = 3;
51
- FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR = 4;
52
- FUNCTION_EXECUTOR_STATUS_IDLE = 5;
53
- FUNCTION_EXECUTOR_STATUS_RUNNING_TASK = 6;
54
- FUNCTION_EXECUTOR_STATUS_UNHEALTHY = 7;
55
- FUNCTION_EXECUTOR_STATUS_STOPPING = 8;
48
+ FUNCTION_EXECUTOR_STATUS_STARTING_UP = 1;
49
+ FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR = 2;
50
+ FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR = 3;
51
+ FUNCTION_EXECUTOR_STATUS_IDLE = 4;
52
+ FUNCTION_EXECUTOR_STATUS_RUNNING_TASK = 5;
53
+ FUNCTION_EXECUTOR_STATUS_UNHEALTHY = 6;
54
+ FUNCTION_EXECUTOR_STATUS_STOPPING = 7;
55
+ FUNCTION_EXECUTOR_STATUS_STOPPED = 8;
56
56
  }
57
57
 
58
58
  // Immutable information that identifies and describes a Function Executor.
@@ -63,6 +63,8 @@ message FunctionExecutorDescription {
63
63
  optional string graph_version = 4;
64
64
  optional string function_name = 5;
65
65
  optional string image_uri = 6;
66
+ repeated string secret_names = 7;
67
+ optional HostResources resource_limits = 8;
66
68
  }
67
69
 
68
70
  message FunctionExecutorState {
@@ -72,19 +74,22 @@ message FunctionExecutorState {
72
74
 
73
75
  enum ExecutorStatus {
74
76
  EXECUTOR_STATUS_UNKNOWN = 0;
75
- EXECUTOR_STATUS_STARTING = 1;
77
+ EXECUTOR_STATUS_STARTING_UP = 1;
76
78
  EXECUTOR_STATUS_RUNNING = 2;
77
79
  EXECUTOR_STATUS_DRAINED = 3;
78
- EXECUTOR_STATUS_SHUTTING_DOWN = 4;
80
+ EXECUTOR_STATUS_STOPPING = 4;
81
+ EXECUTOR_STATUS_STOPPED = 5;
79
82
  }
80
83
 
81
84
  message ExecutorState {
82
85
  optional string executor_id = 1;
83
- optional ExecutorStatus executor_status = 2;
84
- optional HostResources host_resources = 3;
86
+ optional bool development_mode = 2;
87
+ optional ExecutorStatus executor_status = 3;
88
+ // Free resources available at the Executor.
89
+ optional HostResources free_resources = 4;
85
90
  // Empty allowed_functions list means that any function can run on the Executor.
86
- repeated AllowedFunction allowed_functions = 4;
87
- repeated FunctionExecutorState function_executor_states = 5;
91
+ repeated AllowedFunction allowed_functions = 5;
92
+ repeated FunctionExecutorState function_executor_states = 6;
88
93
  }
89
94
 
90
95
  // A message sent by Executor to report its up to date state to Server.
@@ -106,6 +111,7 @@ message Task {
106
111
  optional string graph_invocation_id = 6;
107
112
  optional string input_key = 8;
108
113
  optional string reducer_output_key = 9;
114
+ optional string timeout_ms = 10;
109
115
  }
110
116
 
111
117
  message TaskAllocation {
@@ -139,9 +145,9 @@ service TaskSchedulerService {
139
145
  // Called by Executor to open a stream of its desired states. When Server wants Executor to change something
140
146
  // it puts a message on the stream with the new desired state of the Executor.
141
147
  //
142
- // Depricated HTTP API is used to download the serialized graph and task inputs.
148
+ // Deprecated HTTP API is used to download the serialized graph and task inputs.
143
149
  rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
144
150
 
145
- // Task outcome is currently reported via depricated HTTP API. We're going to migrate task output reporting to gRPC
151
+ // Task outcome is currently reported via deprecated HTTP API. We're going to migrate task output reporting to gRPC
146
152
  // when we move S3 downloads and uploads to Executor.
147
153
  }