indexify 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. indexify/cli/cli.py +20 -91
  2. indexify/executor/api_objects.py +2 -0
  3. indexify/executor/executor.py +77 -86
  4. indexify/executor/function_executor/function_executor_state.py +43 -43
  5. indexify/executor/function_executor/function_executor_states_container.py +10 -4
  6. indexify/executor/function_executor/function_executor_status.py +91 -0
  7. indexify/executor/function_executor/metrics/function_executor.py +1 -1
  8. indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
  9. indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
  10. indexify/executor/function_executor/single_task_runner.py +100 -37
  11. indexify/executor/grpc/channel_creator.py +53 -0
  12. indexify/executor/grpc/metrics/channel_creator.py +18 -0
  13. indexify/executor/grpc/metrics/state_reporter.py +17 -0
  14. indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
  15. indexify/executor/grpc/state_reporter.py +199 -0
  16. indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
  17. indexify/executor/task_runner.py +30 -6
  18. indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
  19. indexify/proto/task_scheduler_pb2.py +64 -0
  20. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
  21. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
  22. {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/METADATA +1 -1
  23. {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/RECORD +25 -21
  24. indexify/executor/state_reporter.py +0 -127
  25. indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
  26. {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/WHEEL +0 -0
  27. {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
1
from enum import Enum
from typing import Dict, FrozenSet


class FunctionExecutorStatus(Enum):
    """Status of a Function Executor.

    Each status lists transitions allowed to it.
    """

    # DESTROYED -> STARTING_UP
    STARTING_UP = "Starting Up"
    # STARTING_UP -> STARTUP_FAILED_CUSTOMER_ERROR
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
    # STARTING_UP -> STARTUP_FAILED_PLATFORM_ERROR
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
    # STARTING_UP -> IDLE
    # RUNNING_TASK -> IDLE
    IDLE = "Idle"
    # IDLE -> RUNNING_TASK
    RUNNING_TASK = "Running Task"
    # IDLE -> UNHEALTHY
    # RUNNING_TASK -> UNHEALTHY
    UNHEALTHY = "Unhealthy"
    # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
    # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
    # UNHEALTHY -> DESTROYING
    # IDLE -> DESTROYING
    DESTROYING = "Destroying"
    # DESTROYED (initial status)
    # DESTROYING -> DESTROYED
    DESTROYED = "Destroyed"
    # Any state -> SHUTDOWN
    SHUTDOWN = "Shutdown"  # Permanent stop state


# Transitions allowed in addition to the two every status has implicitly
# (a no-op self-transition and a transition to SHUTDOWN).
# Built once at import time instead of on every is_status_change_allowed()
# call, with frozensets for O(1) membership tests.
_ALLOWED_TRANSITIONS: Dict[FunctionExecutorStatus, FrozenSet[FunctionExecutorStatus]] = {
    FunctionExecutorStatus.DESTROYED: frozenset((FunctionExecutorStatus.STARTING_UP,)),
    FunctionExecutorStatus.STARTING_UP: frozenset(
        (
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.IDLE,
        )
    ),
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: frozenset(
        (FunctionExecutorStatus.DESTROYING,)
    ),
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: frozenset(
        (FunctionExecutorStatus.DESTROYING,)
    ),
    FunctionExecutorStatus.IDLE: frozenset(
        (
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
        )
    ),
    FunctionExecutorStatus.RUNNING_TASK: frozenset(
        (FunctionExecutorStatus.IDLE, FunctionExecutorStatus.UNHEALTHY)
    ),
    FunctionExecutorStatus.UNHEALTHY: frozenset((FunctionExecutorStatus.DESTROYING,)),
    FunctionExecutorStatus.DESTROYING: frozenset((FunctionExecutorStatus.DESTROYED,)),
    # No transitions allowed from SHUTDOWN (besides the implicit SHUTDOWN -> SHUTDOWN).
    FunctionExecutorStatus.SHUTDOWN: frozenset(),
}


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition is allowed."""
    # Every status may repeat itself (no-op transition) and any status may
    # transition to the permanent SHUTDOWN status.
    if new_status is current_status or new_status is FunctionExecutorStatus.SHUTDOWN:
        return True
    return new_status in _ALLOWED_TRANSITIONS.get(current_status, frozenset())
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
90
90
  )
91
91
  metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
92
92
  "function_executor_infos",
93
- "Number of Function Executors with particular info",
93
+ "Number of Function Executor creations with particular info",
94
94
  ["version", "sdk_version", "sdk_language", "sdk_language_version"],
95
95
  )
96
96
 
@@ -1,5 +1,7 @@
1
1
  import prometheus_client
2
2
 
3
+ from ..function_executor_status import FunctionExecutorStatus
4
+
3
5
  # This file contains all metrics used by FunctionExecutorState.
4
6
 
5
7
  metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
8
10
  "Number of times a Function Executor state was used without acquiring its lock",
9
11
  )
10
12
  )
13

# Function Executors count with a particular status.
metric_function_executors_with_status: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "function_executors_with_status",
        "Number of Function Executors with a particular status",
        ["status"],
    )
)
# Pre-create a labeled child for every status so each per-status time series
# is exported (as 0) before the first real status update, and so new enum
# members are covered automatically.
for _status in FunctionExecutorStatus:
    metric_function_executors_with_status.labels(status=_status.name)
@@ -1,8 +1,10 @@
1
- from typing import Any, Optional
1
+ from dataclasses import dataclass
2
+ from typing import Any, List, Optional
2
3
 
3
4
  from .function_executor_server import FunctionExecutorServer
4
5
 
5
6
 
7
+ @dataclass
6
8
  class FunctionExecutorServerConfiguration:
7
9
  """Configuration for creating a FunctionExecutorServer.
8
10
 
@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
14
16
  configuration parameters or raise an exception if it can't implement
15
17
  them."""
16
18
 
17
- def __init__(
18
- self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
19
- ):
20
- self.executor_id: str = executor_id
21
- self.function_executor_id: str = function_executor_id
22
- # Container image URI of the Function Executor Server.
23
- self.image_uri: Optional[str] = image_uri
19
+ executor_id: str
20
+ function_executor_id: str
21
+ namespace: str
22
+ image_uri: Optional[str]
23
+ secret_names: List[str]
24
24
 
25
25
 
26
26
  class FunctionExecutorServerFactory:
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
14
14
  from ..api_objects import Task
15
15
  from .function_executor import CustomerError, FunctionExecutor
16
16
  from .function_executor_state import FunctionExecutorState
17
+ from .function_executor_status import FunctionExecutorStatus
17
18
  from .health_checker import HealthChecker, HealthCheckResult
18
19
  from .metrics.single_task_runner import (
19
20
  metric_function_executor_run_task_rpc_errors,
@@ -40,9 +41,11 @@ class SingleTaskRunner:
40
41
  logger: Any,
41
42
  ):
42
43
  self._executor_id: str = executor_id
43
- self._state: FunctionExecutorState = function_executor_state
44
+ self._function_executor_state: FunctionExecutorState = function_executor_state
44
45
  self._task_input: TaskInput = task_input
45
- self._factory: FunctionExecutorServerFactory = function_executor_server_factory
46
+ self._function_executor_server_factory: FunctionExecutorServerFactory = (
47
+ function_executor_server_factory
48
+ )
46
49
  self._base_url: str = base_url
47
50
  self._config_path: Optional[str] = config_path
48
51
  self._logger = logger.bind(module=__name__)
@@ -54,18 +57,32 @@ class SingleTaskRunner:
54
57
  The lock is released during actual task run in the server.
55
58
  The lock is relocked on return.
56
59
 
57
- Raises an exception if an error occured."""
58
- self._state.check_locked()
60
+ Raises an exception if an error occurred.
61
+
62
+ On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
63
+ On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
64
+ """
65
+ self._function_executor_state.check_locked()
59
66
 
60
- if self._state.is_shutdown:
61
- raise RuntimeError("Function Executor state is shutting down.")
67
+ if self._function_executor_state.status not in [
68
+ FunctionExecutorStatus.IDLE,
69
+ FunctionExecutorStatus.UNHEALTHY,
70
+ FunctionExecutorStatus.DESTROYED,
71
+ ]:
72
+ self._logger.error(
73
+ "Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
74
+ status=self._function_executor_state.status,
75
+ )
76
+ raise RuntimeError(
77
+ f"Unexpected Function Executor state {self._function_executor_state.status}"
78
+ )
62
79
 
63
80
  # If Function Executor became unhealthy while was idle then destroy it.
64
81
  # It'll be recreated below.
65
82
  await self._destroy_existing_function_executor_if_unhealthy()
66
83
 
67
84
  # Create Function Executor if it doesn't exist yet.
68
- if self._state.function_executor is None:
85
+ if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
69
86
  try:
70
87
  await self._create_function_executor()
71
88
  except CustomerError as e:
@@ -87,15 +104,38 @@ class SingleTaskRunner:
87
104
  # The periodic health checker might not notice this as it does only periodic checks.
88
105
  await self._destroy_existing_function_executor_if_unhealthy()
89
106
 
90
- async def _create_function_executor(self) -> FunctionExecutor:
91
- function_executor: FunctionExecutor = FunctionExecutor(
92
- server_factory=self._factory, logger=self._logger
107
+ if self._function_executor_state.status not in [
108
+ FunctionExecutorStatus.IDLE,
109
+ FunctionExecutorStatus.UNHEALTHY,
110
+ FunctionExecutorStatus.DESTROYED,
111
+ ]:
112
+ self._logger.error(
113
+ "Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
114
+ status=self._function_executor_state.status,
115
+ )
116
+ if self._function_executor_state.function_executor is None:
117
+ await self._function_executor_state.set_status(
118
+ FunctionExecutorStatus.DESTROYED
119
+ )
120
+ else:
121
+ await self._function_executor_state.set_status(
122
+ FunctionExecutorStatus.UNHEALTHY
123
+ )
124
+
125
+ async def _create_function_executor(self) -> None:
126
+ await self._function_executor_state.set_status(
127
+ FunctionExecutorStatus.STARTING_UP
128
+ )
129
+ self._function_executor_state.function_executor = FunctionExecutor(
130
+ server_factory=self._function_executor_server_factory, logger=self._logger
93
131
  )
94
132
  config: FunctionExecutorServerConfiguration = (
95
133
  FunctionExecutorServerConfiguration(
96
134
  executor_id=self._executor_id,
97
- function_executor_id=self._state.id,
135
+ function_executor_id=self._function_executor_state.id,
136
+ namespace=self._task_input.task.namespace,
98
137
  image_uri=self._task_input.task.image_uri,
138
+ secret_names=self._task_input.task.secret_names or [],
99
139
  )
100
140
  )
101
141
  initialize_request: InitializeRequest = InitializeRequest(
@@ -107,17 +147,29 @@ class SingleTaskRunner:
107
147
  )
108
148
 
109
149
  try:
110
- await function_executor.initialize(
150
+ await self._function_executor_state.function_executor.initialize(
111
151
  config=config,
112
152
  initialize_request=initialize_request,
113
153
  base_url=self._base_url,
114
154
  config_path=self._config_path,
115
155
  )
116
- self._state.function_executor = function_executor
156
+ except CustomerError:
157
+ # We have to follow the valid state transition sequence.
158
+ await self._function_executor_state.set_status(
159
+ FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
160
+ )
161
+ await self._function_executor_state.destroy_function_executor()
162
+ raise
117
163
  except Exception:
118
- await function_executor.destroy()
164
+ # We have to follow the valid state transition sequence.
165
+ await self._function_executor_state.set_status(
166
+ FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
167
+ )
168
+ await self._function_executor_state.destroy_function_executor()
119
169
  raise
120
170
 
171
+ await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
172
+
121
173
  async def _run(self) -> TaskOutput:
122
174
  request: RunTaskRequest = RunTaskRequest(
123
175
  namespace=self._task_input.task.namespace,
@@ -130,13 +182,15 @@ class SingleTaskRunner:
130
182
  )
131
183
  if self._task_input.init_value is not None:
132
184
  request.function_init_value.CopyFrom(self._task_input.init_value)
133
- channel: grpc.aio.Channel = self._state.function_executor.channel()
185
+ channel: grpc.aio.Channel = (
186
+ self._function_executor_state.function_executor.channel()
187
+ )
134
188
 
135
189
  async with _RunningTaskContextManager(
136
190
  invocation_id=self._task_input.task.invocation_id,
137
191
  task_id=self._task_input.task.id,
138
192
  health_check_failed_callback=self._health_check_failed_callback,
139
- function_executor_state=self._state,
193
+ function_executor_state=self._function_executor_state,
140
194
  ):
141
195
  with (
142
196
  metric_function_executor_run_task_rpc_errors.count_exceptions(),
@@ -154,31 +208,40 @@ class SingleTaskRunner:
154
208
  async def _health_check_failed_callback(self, result: HealthCheckResult):
155
209
  # Function Executor destroy due to the periodic health check failure ensures that
156
210
  # a running task RPC stuck in unhealthy Function Executor fails immediately.
157
- async with self._state.lock:
158
- if self._state.function_executor is not None:
159
- await self._destroy_function_executor_on_failed_health_check(
160
- result.reason
161
- )
211
+ async with self._function_executor_state.lock:
212
+ if (
213
+ self._function_executor_state.status
214
+ != FunctionExecutorStatus.RUNNING_TASK
215
+ ):
216
+ # Protection in case the callback gets delivered after we finished running the task.
217
+ return
218
+
219
+ await self._function_executor_state.set_status(
220
+ FunctionExecutorStatus.UNHEALTHY
221
+ )
222
+ await self._destroy_function_executor_on_failed_health_check(result.reason)
162
223
 
163
224
  async def _destroy_existing_function_executor_if_unhealthy(self):
164
- self._state.check_locked()
165
- if self._state.function_executor is None:
166
- return
167
- result: HealthCheckResult = (
168
- await self._state.function_executor.health_checker().check()
169
- )
170
- if result.is_healthy:
171
- return
172
- await self._destroy_function_executor_on_failed_health_check(result.reason)
225
+ self._function_executor_state.check_locked()
226
+ if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
227
+ result: HealthCheckResult = (
228
+ await self._function_executor_state.function_executor.health_checker().check()
229
+ )
230
+ if not result.is_healthy:
231
+ await self._function_executor_state.set_status(
232
+ FunctionExecutorStatus.UNHEALTHY
233
+ )
234
+
235
+ if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
236
+ await self._destroy_function_executor_on_failed_health_check(result.reason)
173
237
 
174
238
  async def _destroy_function_executor_on_failed_health_check(self, reason: str):
175
- self._state.check_locked()
239
+ self._function_executor_state.check_locked()
176
240
  self._logger.error(
177
241
  "Function Executor health check failed, destroying Function Executor",
178
242
  health_check_fail_reason=reason,
179
243
  )
180
- self._state.health_check_failed = True
181
- await self._state.destroy_function_executor()
244
+ await self._function_executor_state.destroy_function_executor()
182
245
 
183
246
 
184
247
  class _RunningTaskContextManager:
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
199
262
  self._state: FunctionExecutorState = function_executor_state
200
263
 
201
264
  async def __aenter__(self):
202
- self._state.increment_running_tasks()
265
+ await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
203
266
  self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
204
267
  task_id=self._task_id,
205
268
  invocation_id=self._invocation_id,
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:
213
276
 
214
277
  async def __aexit__(self, exc_type, exc_val, exc_tb):
215
278
  await self._state.lock.acquire()
216
- self._state.decrement_running_tasks()
217
- # Health check callback could destroy the FunctionExecutor.
218
- if self._state.function_executor is not None:
279
+ # Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
280
+ if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
281
+ await self._state.set_status(FunctionExecutorStatus.IDLE)
219
282
  self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
220
283
  task_id=self._task_id
221
284
  )
@@ -0,0 +1,53 @@
1
import asyncio
from typing import Any, Optional

import grpc.aio

from .metrics.channel_creator import (
    metric_grpc_server_channel_creation_latency,
    metric_grpc_server_channel_creation_retries,
    metric_grpc_server_channel_creations,
)

# Delay between connection attempts and the time budget for a single attempt.
_RETRY_INTERVAL_SEC = 5
_CONNECT_TIMEOUT_SEC = 5


class ChannelCreator:
    """Creates gRPC channels to the Server, retrying until one is ready."""

    def __init__(self, server_address: str, logger: Any):
        self._logger = logger.bind(module=__name__)
        self._server_address = server_address
        self._is_shutdown = False

    async def create(self) -> Optional[grpc.aio.Channel]:
        """Creates a channel to the gRPC server.

        Blocks until the channel is ready.
        Never raises any exceptions.
        Returns None if shutdown() was called before a channel became ready.
        """
        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while not self._is_shutdown:
                # Track the channel so a half-open one can be closed on failure;
                # None means channel creation itself raised, so there is
                # nothing to close (the original code would hit an unbound
                # local in that case).
                channel: Optional[grpc.aio.Channel] = None
                try:
                    channel = grpc.aio.insecure_channel(self._server_address)
                    await asyncio.wait_for(
                        channel.channel_ready(),
                        timeout=_CONNECT_TIMEOUT_SEC,
                    )
                    return channel
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    if channel is not None:
                        try:
                            await channel.close()
                        except Exception as e:
                            self._logger.error(
                                "failed closing not established channel", exc_info=e
                            )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)
        return None

    async def shutdown(self):
        """Makes pending and future create() calls stop retrying and return."""
        self._is_shutdown = True
@@ -0,0 +1,18 @@
1
import prometheus_client

from ...monitoring.metrics import latency_metric_for_fast_operation

# Total attempts to create a channel to the gRPC Server.
metric_grpc_server_channel_creations: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creations",
    "Number of times a channel to gRPC Server was created",
)

# Retries performed while establishing a single channel.
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creation_retries",
    "Number of retries during a channel creation to gRPC Server",
)

# Wall-clock time spent establishing a channel, including retries.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "grpc_server_channel_creation",
    "gRPC server channel creation",
)
@@ -0,0 +1,17 @@
1
import prometheus_client

from ...monitoring.metrics import latency_metric_for_fast_operation

# Total state report RPCs attempted against the Server.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)

# State report RPCs that failed.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)

# Latency of a single state report RPC.
metric_state_report_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "state_report_rpc", "Executor state report rpc to Server"
)
@@ -7,29 +7,29 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
  SerializedObject,
8
8
  )
9
9
 
10
- from indexify.task_scheduler.proto.task_scheduler_pb2 import (
10
+ from indexify.proto.task_scheduler_pb2 import (
11
11
  DesiredExecutorState,
12
12
  FunctionExecutorDescription,
13
13
  FunctionExecutorStatus,
14
14
  GetDesiredExecutorStatesRequest,
15
15
  )
16
- from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
16
+ from indexify.proto.task_scheduler_pb2_grpc import (
17
17
  TaskSchedulerServiceStub,
18
18
  )
19
19
 
20
- from .downloader import Downloader
21
- from .function_executor.function_executor import CustomerError, FunctionExecutor
22
- from .function_executor.function_executor_state import FunctionExecutorState
23
- from .function_executor.function_executor_states_container import (
20
+ from ..downloader import Downloader
21
+ from ..function_executor.function_executor import CustomerError, FunctionExecutor
22
+ from ..function_executor.function_executor_state import FunctionExecutorState
23
+ from ..function_executor.function_executor_states_container import (
24
24
  FunctionExecutorStatesContainer,
25
25
  )
26
- from .function_executor.server.function_executor_server_factory import (
26
+ from ..function_executor.server.function_executor_server_factory import (
27
27
  FunctionExecutorServerConfiguration,
28
28
  FunctionExecutorServerFactory,
29
29
  )
30
- from .function_executor.task_input import TaskInput
31
- from .function_executor.task_output import TaskOutput
32
- from .metrics.executor import (
30
+ from ..function_executor.task_input import TaskInput
31
+ from ..function_executor.task_output import TaskOutput
32
+ from ..metrics.executor import (
33
33
  METRIC_TASKS_COMPLETED_OUTCOME_ALL,
34
34
  METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
35
35
  METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
@@ -42,7 +42,10 @@ from .metrics.executor import (
42
42
  metric_tasks_fetched,
43
43
  metric_tasks_reporting_outcome,
44
44
  )
45
- from .task_reporter import TaskReporter
45
+ from ..task_reporter import TaskReporter
46
+ from .channel_creator import ChannelCreator
47
+
48
+ _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
46
49
 
47
50
 
48
51
  class ExecutorStateReconciler:
@@ -55,11 +58,13 @@ class ExecutorStateReconciler:
55
58
  config_path: Optional[str],
56
59
  downloader: Downloader,
57
60
  task_reporter: TaskReporter,
58
- server_channel: grpc.aio.Channel,
61
+ channel_creator: ChannelCreator,
59
62
  logger: Any,
60
63
  ):
61
64
  self._executor_id: str = executor_id
62
- self._factory: FunctionExecutorServerFactory = function_executor_server_factory
65
+ self._function_executor_server_factory: FunctionExecutorServerFactory = (
66
+ function_executor_server_factory
67
+ )
63
68
  self._base_url: str = base_url
64
69
  self._config_path: Optional[str] = config_path
65
70
  self._downloader: Downloader = downloader
@@ -67,39 +72,60 @@ class ExecutorStateReconciler:
67
72
  self._function_executor_states: FunctionExecutorStatesContainer = (
68
73
  function_executor_states
69
74
  )
70
- self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
75
+ self._channel_creator = channel_creator
71
76
  self._logger: Any = logger.bind(module=__name__)
72
77
  self._is_shutdown: bool = False
73
- self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
74
78
  self._server_last_clock: Optional[int] = None
75
79
 
76
80
  async def run(self):
77
- desired_states: AsyncGenerator[DesiredExecutorState, None] = (
78
- self._stub.get_desired_executor_states(
79
- GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
80
- )
81
- )
81
+ """Runs the state reconciler.
82
+
83
+ Never raises any exceptions.
84
+ """
85
+ while not self._is_shutdown:
86
+ async with await self._channel_creator.create() as server_channel:
87
+ server_channel: grpc.aio.Channel
88
+ stub = TaskSchedulerServiceStub(server_channel)
89
+ while not self._is_shutdown:
90
+ try:
91
+ # TODO: Report state once before starting the stream.
92
+ desired_states_stream: AsyncGenerator[
93
+ DesiredExecutorState, None
94
+ ] = stub.get_desired_executor_states(
95
+ GetDesiredExecutorStatesRequest(
96
+ executor_id=self._executor_id
97
+ )
98
+ )
99
+ await self._process_desired_states_stream(desired_states_stream)
100
+ except Exception as e:
101
+ self._logger.error(
102
+ f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
103
+ exc_info=e,
104
+ )
105
+ await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
106
+ break
107
+
108
+ self._logger.info("State reconciler shutdown.")
109
+
110
+ async def _process_desired_states_stream(
111
+ self, desired_states: AsyncGenerator[DesiredExecutorState, None]
112
+ ):
82
113
  async for new_state in desired_states:
83
114
  if self._is_shutdown:
84
115
  return
116
+
85
117
  new_state: DesiredExecutorState
86
118
  if self._server_last_clock is not None:
87
119
  if self._server_last_clock >= new_state.clock:
88
120
  continue # Duplicate or outdated message state sent by Server.
89
121
 
90
122
  self._server_last_clock = new_state.clock
91
- asyncio.create_task(self._reconcile_state(new_state))
123
+ await self._reconcile_state(new_state)
92
124
 
93
125
  async def _reconcile_state(self, new_state: DesiredExecutorState):
94
- if self._is_shutdown:
95
- return
96
-
97
- # Simple non concurrent implementation for now for the PoC.
98
- # Obtain this lock to force only a single coroutine doing the reconciliation.
99
- async with self._reconciliation_lock:
100
- await self._reconcile_function_executors(new_state)
101
- # TODO
102
- # await self._reconcile_task_allocations(new_state)
126
+ await self._reconcile_function_executors(new_state)
127
+ # TODO
128
+ # await self._reconcile_task_allocations(new_state)
103
129
 
104
130
  async def shutdown(self):
105
131
  """Shuts down the state reconciler.
@@ -121,6 +147,7 @@ class ExecutorStateReconciler:
121
147
  graph_name=desired_function_executor.graph_name,
122
148
  graph_version=desired_function_executor.graph_version,
123
149
  function_name=desired_function_executor.function_name,
150
+ image_uri=desired_function_executor.image_uri,
124
151
  )
125
152
  )
126
153
 
@@ -203,13 +230,15 @@ class ExecutorStateReconciler:
203
230
  logger=logger,
204
231
  )
205
232
  function_executor: FunctionExecutor = FunctionExecutor(
206
- server_factory=self._factory, logger=logger
233
+ server_factory=self._function_executor_server_factory, logger=logger
207
234
  )
208
235
  config: FunctionExecutorServerConfiguration = (
209
236
  FunctionExecutorServerConfiguration(
210
237
  executor_id=self._executor_id,
211
238
  function_executor_id=description.id,
239
+ namespace=description.namespace,
212
240
  image_uri=description.image_uri,
241
+ secret_names=list(description.secret_names),
213
242
  )
214
243
  )
215
244
  initialize_request: InitializeRequest = InitializeRequest(