indexify 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. indexify/cli/cli.py +20 -91
  2. indexify/executor/api_objects.py +2 -0
  3. indexify/executor/executor.py +75 -84
  4. indexify/executor/function_executor/function_executor.py +5 -2
  5. indexify/executor/function_executor/function_executor_state.py +43 -43
  6. indexify/executor/function_executor/function_executor_states_container.py +10 -4
  7. indexify/executor/function_executor/function_executor_status.py +91 -0
  8. indexify/executor/function_executor/health_checker.py +37 -13
  9. indexify/executor/function_executor/metrics/function_executor.py +1 -1
  10. indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
  11. indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
  12. indexify/executor/function_executor/single_task_runner.py +100 -37
  13. indexify/executor/grpc/channel_creator.py +53 -0
  14. indexify/executor/grpc/metrics/channel_creator.py +18 -0
  15. indexify/executor/grpc/metrics/state_reporter.py +17 -0
  16. indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
  17. indexify/executor/grpc/state_reporter.py +199 -0
  18. indexify/executor/metrics/task_runner.py +7 -0
  19. indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
  20. indexify/executor/task_runner.py +34 -6
  21. indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
  22. indexify/proto/task_scheduler_pb2.py +64 -0
  23. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
  24. indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
  25. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/METADATA +1 -1
  26. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/RECORD +28 -24
  27. indexify/executor/state_reporter.py +0 -127
  28. indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
  29. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/WHEEL +0 -0
  30. {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
+ from enum import Enum
+
+
+ class FunctionExecutorStatus(Enum):
+     """Status of a Function Executor.
+
+     Each status lists transitions allowed to it.
+     """
+
+     # DESTROYED -> STARTING_UP
+     STARTING_UP = "Starting Up"
+     # STARTING_UP -> STARTUP_FAILED_CUSTOMER_ERROR
+     STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
+     # STARTING_UP -> STARTUP_FAILED_PLATFORM_ERROR
+     STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
+     # STARTING_UP -> IDLE
+     # RUNNING_TASK -> IDLE
+     IDLE = "Idle"
+     # IDLE -> RUNNING_TASK
+     RUNNING_TASK = "Running Task"
+     # IDLE -> UNHEALTHY
+     # RUNNING_TASK -> UNHEALTHY
+     UNHEALTHY = "Unhealthy"
+     # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
+     # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
+     # UNHEALTHY -> DESTROYING
+     # IDLE -> DESTROYING
+     DESTROYING = "Destroying"
+     # DESTROYED (initial status)
+     # DESTROYING -> DESTROYED
+     DESTROYED = "Destroyed"
+     # Any state -> SHUTDOWN
+     SHUTDOWN = "Shutdown"  # Permanent stop state
+
+
+ def is_status_change_allowed(
+     current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
+ ) -> bool:
+     """Returns True if the transition is allowed."""
+     allowed_transitions = {
+         FunctionExecutorStatus.DESTROYED: [
+             FunctionExecutorStatus.DESTROYED,
+             FunctionExecutorStatus.STARTING_UP,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.STARTING_UP: [
+             FunctionExecutorStatus.STARTING_UP,
+             FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
+             FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
+             FunctionExecutorStatus.IDLE,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: [
+             FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
+             FunctionExecutorStatus.DESTROYING,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: [
+             FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
+             FunctionExecutorStatus.DESTROYING,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.IDLE: [
+             FunctionExecutorStatus.IDLE,
+             FunctionExecutorStatus.RUNNING_TASK,
+             FunctionExecutorStatus.UNHEALTHY,
+             FunctionExecutorStatus.DESTROYING,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.RUNNING_TASK: [
+             FunctionExecutorStatus.RUNNING_TASK,
+             FunctionExecutorStatus.IDLE,
+             FunctionExecutorStatus.UNHEALTHY,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.UNHEALTHY: [
+             FunctionExecutorStatus.UNHEALTHY,
+             FunctionExecutorStatus.DESTROYING,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.DESTROYING: [
+             FunctionExecutorStatus.DESTROYING,
+             FunctionExecutorStatus.DESTROYED,
+             FunctionExecutorStatus.SHUTDOWN,
+         ],
+         FunctionExecutorStatus.SHUTDOWN: [
+             FunctionExecutorStatus.SHUTDOWN
+         ],  # No transitions allowed from SHUTDOWN
+     }
+
+     return new_status in allowed_transitions.get(current_status, [])
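The new file above (indexify/executor/function_executor/function_executor_status.py, +91) centralizes the Function Executor lifecycle. As a quick editorial illustration, not part of the package diff, the transition check can be exercised like this (import path inferred from the file list):

from indexify.executor.function_executor.function_executor_status import (
    FunctionExecutorStatus,
    is_status_change_allowed,
)

# A freshly destroyed executor may start up, but nothing leaves the terminal SHUTDOWN status.
assert is_status_change_allowed(
    FunctionExecutorStatus.DESTROYED, FunctionExecutorStatus.STARTING_UP
)
assert not is_status_change_allowed(
    FunctionExecutorStatus.SHUTDOWN, FunctionExecutorStatus.IDLE
)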
@@ -1,8 +1,10 @@
  import asyncio
+ import os
  from collections.abc import Awaitable, Callable
  from typing import Any, Optional

- from grpc.aio import AioRpcError
+ import grpc
+ import grpc.aio
  from tensorlake.function_executor.proto.function_executor_pb2 import (
      HealthCheckRequest,
      HealthCheckResponse,
@@ -27,7 +29,10 @@ class HealthCheckResult:


  class HealthChecker:
-     def __init__(self, stub: FunctionExecutorStub, logger: Any):
+     def __init__(
+         self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
+     ):
+         self._channel: grpc.aio.Channel = channel
          self._stub: FunctionExecutorStub = stub
          self._logger: Any = logger.bind(module=__name__)
          self._health_check_loop_task: Optional[asyncio.Task] = None
@@ -39,6 +44,12 @@ class HealthChecker:
          """Runs the health check once and returns the result.

          Does not raise any exceptions."""
+         if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
+             return HealthCheckResult(
+                 is_healthy=True,
+                 reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
+             )
+
          with metric_health_check_latency.time():
              try:
                  response: HealthCheckResponse = await self._stub.check_health(
@@ -49,19 +60,32 @@ class HealthChecker:
                  return HealthCheckResult(
                      is_healthy=response.healthy, reason=response.status_message
                  )
-             except AioRpcError as e:
-                 metric_failed_health_checks.inc()
-                 # Expected exception when there are problems with communication because e.g. the server is unhealthy.
-                 return HealthCheckResult(
-                     is_healthy=False,
-                     reason=f"Executor side RPC channel error: {str(e)}",
-                 )
+             except grpc.aio.AioRpcError as e:
+                 # Due to the customer code running in Function Executor we can't reliably conclude
+                 # that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
+                 # hold Python GIL and prevent the health check RPC from being processed by FE Python code.
+                 #
+                 # The only unhealthy condition we can be sure about is when the channel can't re-establish
+                 # the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
+                 # code is not involved when TCP connections are established to FE. Problems reestablishing
+                 # the TCP connection are usually due to the FE process crashing and its gRPC server socket
+                 # not being available anymore or due to prolonged local networking failures on Executor.
+                 channel_connectivity = self._channel.get_state()
+                 if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
+                     return HealthCheckResult(
+                         is_healthy=False,
+                         reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
+                     )
+                 else:
+                     return HealthCheckResult(
+                         is_healthy=True,
+                         reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
+                     )
              except Exception as e:
-                 metric_failed_health_checks.inc()
-                 self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
+                 self._logger.error("Got unexpected exception, ignoring", exc_info=e)
                  return HealthCheckResult(
-                     is_healthy=False,
-                     reason=f"Unexpected exception in Executor: {str(e)}",
+                     is_healthy=True,
+                     reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
                  )

      def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
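The health_checker.py hunks above narrow what counts as unhealthy (only a channel stuck in TRANSIENT_FAILURE) and add an escape hatch via the INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS environment variable. A minimal editorial sketch of registering a callback, based only on the start() signature shown above (the checker instance and the reaction are assumptions, not code from the diff):

from indexify.executor.function_executor.health_checker import (
    HealthChecker,
    HealthCheckResult,
)

async def on_health_check_failed(result: HealthCheckResult) -> None:
    # Invoked by the periodic health check loop when a check fails.
    print(f"Function Executor health check failed: {result.reason}")

def watch(checker: HealthChecker) -> None:
    checker.start(on_health_check_failed)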
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
  )
  metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
      "function_executor_infos",
-     "Number of Function Executors with particular info",
+     "Number of Function Executor creations with particular info",
      ["version", "sdk_version", "sdk_language", "sdk_language_version"],
  )

@@ -1,5 +1,7 @@
  import prometheus_client

+ from ..function_executor_status import FunctionExecutorStatus
+
  # This file contains all metrics used by FunctionExecutorState.

  metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
          "Number of times a Function Executor state was used without acquiring its lock",
      )
  )
+
+ # Function Executors count with a particular status.
+ metric_function_executors_with_status: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "function_executors_with_status",
+         "Number of Function Executors with a particular status",
+         ["status"],
+     )
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.STARTING_UP.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR.name
+ )
+ metric_function_executors_with_status.labels(status=FunctionExecutorStatus.IDLE.name)
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.RUNNING_TASK.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.UNHEALTHY.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.DESTROYING.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.DESTROYED.name
+ )
+ metric_function_executors_with_status.labels(
+     status=FunctionExecutorStatus.SHUTDOWN.name
+ )
@@ -1,8 +1,10 @@
- from typing import Any, Optional
+ from dataclasses import dataclass
+ from typing import Any, List, Optional

  from .function_executor_server import FunctionExecutorServer


+ @dataclass
  class FunctionExecutorServerConfiguration:
      """Configuration for creating a FunctionExecutorServer.

@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
      configuration parameters or raise an exception if it can't implement
      them."""

-     def __init__(
-         self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
-     ):
-         self.executor_id: str = executor_id
-         self.function_executor_id: str = function_executor_id
-         # Container image URI of the Function Executor Server.
-         self.image_uri: Optional[str] = image_uri
+     executor_id: str
+     function_executor_id: str
+     namespace: str
+     image_uri: Optional[str]
+     secret_names: List[str]


  class FunctionExecutorServerFactory:
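FunctionExecutorServerConfiguration is now a @dataclass with two new fields, namespace and secret_names. A hedged construction example (all values are placeholders; only the field names come from the diff):

from indexify.executor.function_executor.server.function_executor_server_factory import (
    FunctionExecutorServerConfiguration,
)

config = FunctionExecutorServerConfiguration(
    executor_id="executor-1",
    function_executor_id="fe-1",
    namespace="default",
    image_uri=None,   # or a container image URI for the Function Executor Server
    secret_names=[],  # names of secrets to expose to the function
)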
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
  from ..api_objects import Task
  from .function_executor import CustomerError, FunctionExecutor
  from .function_executor_state import FunctionExecutorState
+ from .function_executor_status import FunctionExecutorStatus
  from .health_checker import HealthChecker, HealthCheckResult
  from .metrics.single_task_runner import (
      metric_function_executor_run_task_rpc_errors,
@@ -40,9 +41,11 @@ class SingleTaskRunner:
          logger: Any,
      ):
          self._executor_id: str = executor_id
-         self._state: FunctionExecutorState = function_executor_state
+         self._function_executor_state: FunctionExecutorState = function_executor_state
          self._task_input: TaskInput = task_input
-         self._factory: FunctionExecutorServerFactory = function_executor_server_factory
+         self._function_executor_server_factory: FunctionExecutorServerFactory = (
+             function_executor_server_factory
+         )
          self._base_url: str = base_url
          self._config_path: Optional[str] = config_path
          self._logger = logger.bind(module=__name__)
@@ -54,18 +57,32 @@ class SingleTaskRunner:
          The lock is released during actual task run in the server.
          The lock is relocked on return.

-         Raises an exception if an error occured."""
-         self._state.check_locked()
+         Raises an exception if an error occured.
+
+         On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
+         On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
+         """
+         self._function_executor_state.check_locked()

-         if self._state.is_shutdown:
-             raise RuntimeError("Function Executor state is shutting down.")
+         if self._function_executor_state.status not in [
+             FunctionExecutorStatus.IDLE,
+             FunctionExecutorStatus.UNHEALTHY,
+             FunctionExecutorStatus.DESTROYED,
+         ]:
+             self._logger.error(
+                 "Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
+                 status=self._function_executor_state.status,
+             )
+             raise RuntimeError(
+                 f"Unexpected Function Executor state {self._function_executor_state.status}"
+             )

          # If Function Executor became unhealthy while was idle then destroy it.
          # It'll be recreated below.
          await self._destroy_existing_function_executor_if_unhealthy()

          # Create Function Executor if it doesn't exist yet.
-         if self._state.function_executor is None:
+         if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
              try:
                  await self._create_function_executor()
              except CustomerError as e:
@@ -87,15 +104,38 @@ class SingleTaskRunner:
          # The periodic health checker might not notice this as it does only periodic checks.
          await self._destroy_existing_function_executor_if_unhealthy()

-     async def _create_function_executor(self) -> FunctionExecutor:
-         function_executor: FunctionExecutor = FunctionExecutor(
-             server_factory=self._factory, logger=self._logger
+         if self._function_executor_state.status not in [
+             FunctionExecutorStatus.IDLE,
+             FunctionExecutorStatus.UNHEALTHY,
+             FunctionExecutorStatus.DESTROYED,
+         ]:
+             self._logger.error(
+                 "Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
+                 status=self._function_executor_state.status,
+             )
+             if self._function_executor_state.function_executor is None:
+                 await self._function_executor_state.set_status(
+                     FunctionExecutorStatus.DESTROYED
+                 )
+             else:
+                 await self._function_executor_state.set_status(
+                     FunctionExecutorStatus.UNHEALTHY
+                 )
+
+     async def _create_function_executor(self) -> None:
+         await self._function_executor_state.set_status(
+             FunctionExecutorStatus.STARTING_UP
+         )
+         self._function_executor_state.function_executor = FunctionExecutor(
+             server_factory=self._function_executor_server_factory, logger=self._logger
          )
          config: FunctionExecutorServerConfiguration = (
              FunctionExecutorServerConfiguration(
                  executor_id=self._executor_id,
-                 function_executor_id=self._state.id,
+                 function_executor_id=self._function_executor_state.id,
+                 namespace=self._task_input.task.namespace,
                  image_uri=self._task_input.task.image_uri,
+                 secret_names=self._task_input.task.secret_names or [],
              )
          )
          initialize_request: InitializeRequest = InitializeRequest(
@@ -107,17 +147,29 @@ class SingleTaskRunner:
          )

          try:
-             await function_executor.initialize(
+             await self._function_executor_state.function_executor.initialize(
                  config=config,
                  initialize_request=initialize_request,
                  base_url=self._base_url,
                  config_path=self._config_path,
              )
-             self._state.function_executor = function_executor
+         except CustomerError:
+             # We have to follow the valid state transition sequence.
+             await self._function_executor_state.set_status(
+                 FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
+             )
+             await self._function_executor_state.destroy_function_executor()
+             raise
          except Exception:
-             await function_executor.destroy()
+             # We have to follow the valid state transition sequence.
+             await self._function_executor_state.set_status(
+                 FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
+             )
+             await self._function_executor_state.destroy_function_executor()
              raise

+         await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
+
      async def _run(self) -> TaskOutput:
          request: RunTaskRequest = RunTaskRequest(
              namespace=self._task_input.task.namespace,
@@ -130,13 +182,15 @@ class SingleTaskRunner:
          )
          if self._task_input.init_value is not None:
              request.function_init_value.CopyFrom(self._task_input.init_value)
-         channel: grpc.aio.Channel = self._state.function_executor.channel()
+         channel: grpc.aio.Channel = (
+             self._function_executor_state.function_executor.channel()
+         )

          async with _RunningTaskContextManager(
              invocation_id=self._task_input.task.invocation_id,
              task_id=self._task_input.task.id,
              health_check_failed_callback=self._health_check_failed_callback,
-             function_executor_state=self._state,
+             function_executor_state=self._function_executor_state,
          ):
              with (
                  metric_function_executor_run_task_rpc_errors.count_exceptions(),
@@ -154,31 +208,40 @@ class SingleTaskRunner:
      async def _health_check_failed_callback(self, result: HealthCheckResult):
          # Function Executor destroy due to the periodic health check failure ensures that
          # a running task RPC stuck in unhealthy Function Executor fails immidiately.
-         async with self._state.lock:
-             if self._state.function_executor is not None:
-                 await self._destroy_function_executor_on_failed_health_check(
-                     result.reason
-                 )
+         async with self._function_executor_state.lock:
+             if (
+                 self._function_executor_state.status
+                 != FunctionExecutorStatus.RUNNING_TASK
+             ):
+                 # Protection in case the callback gets delivered after we finished running the task.
+                 return
+
+             await self._function_executor_state.set_status(
+                 FunctionExecutorStatus.UNHEALTHY
+             )
+             await self._destroy_function_executor_on_failed_health_check(result.reason)

      async def _destroy_existing_function_executor_if_unhealthy(self):
-         self._state.check_locked()
-         if self._state.function_executor is None:
-             return
-         result: HealthCheckResult = (
-             await self._state.function_executor.health_checker().check()
-         )
-         if result.is_healthy:
-             return
-         await self._destroy_function_executor_on_failed_health_check(result.reason)
+         self._function_executor_state.check_locked()
+         if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
+             result: HealthCheckResult = (
+                 await self._function_executor_state.function_executor.health_checker().check()
+             )
+             if not result.is_healthy:
+                 await self._function_executor_state.set_status(
+                     FunctionExecutorStatus.UNHEALTHY
+                 )
+
+         if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
+             await self._destroy_function_executor_on_failed_health_check(result.reason)

      async def _destroy_function_executor_on_failed_health_check(self, reason: str):
-         self._state.check_locked()
+         self._function_executor_state.check_locked()
          self._logger.error(
              "Function Executor health check failed, destroying Function Executor",
              health_check_fail_reason=reason,
          )
-         self._state.health_check_failed = True
-         await self._state.destroy_function_executor()
+         await self._function_executor_state.destroy_function_executor()


  class _RunningTaskContextManager:
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
          self._state: FunctionExecutorState = function_executor_state

      async def __aenter__(self):
-         self._state.increment_running_tasks()
+         await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
          self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
              task_id=self._task_id,
              invocation_id=self._invocation_id,
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:

      async def __aexit__(self, exc_type, exc_val, exc_tb):
          await self._state.lock.acquire()
-         self._state.decrement_running_tasks()
-         # Health check callback could destroy the FunctionExecutor.
-         if self._state.function_executor is not None:
+         # Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
+         if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
+             await self._state.set_status(FunctionExecutorStatus.IDLE)
          self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
              task_id=self._task_id
          )
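Taken together, the single_task_runner.py hunks drive each task through the new status machine instead of ad-hoc booleans. An editorial sketch, not part of the diff, checking that the happy-path sequence implied by these changes is permitted by the transition table introduced earlier:

from indexify.executor.function_executor.function_executor_status import (
    FunctionExecutorStatus,
    is_status_change_allowed,
)

# DESTROYED -> STARTING_UP -> IDLE on first use, then IDLE <-> RUNNING_TASK per task run.
happy_path = [
    FunctionExecutorStatus.DESTROYED,
    FunctionExecutorStatus.STARTING_UP,
    FunctionExecutorStatus.IDLE,
    FunctionExecutorStatus.RUNNING_TASK,
    FunctionExecutorStatus.IDLE,
]
for current, new in zip(happy_path, happy_path[1:]):
    assert is_status_change_allowed(current, new)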
@@ -0,0 +1,53 @@
+ import asyncio
+ from typing import Any
+
+ import grpc.aio
+
+ from .metrics.channel_creator import (
+     metric_grpc_server_channel_creation_latency,
+     metric_grpc_server_channel_creation_retries,
+     metric_grpc_server_channel_creations,
+ )
+
+ _RETRY_INTERVAL_SEC = 5
+ _CONNECT_TIMEOUT_SEC = 5
+
+
+ class ChannelCreator:
+     def __init__(self, server_address: str, logger: Any):
+         self._logger = logger.bind(module=__name__)
+         self._server_address = server_address
+         self._is_shutdown = False
+
+     async def create(self) -> grpc.aio.Channel:
+         """Creates a channel to the gRPC server.
+
+         Blocks until the channel is ready.
+         Never raises any exceptions.
+         """
+         with metric_grpc_server_channel_creation_latency.time():
+             metric_grpc_server_channel_creations.inc()
+             while not self._is_shutdown:
+                 try:
+                     channel = grpc.aio.insecure_channel(self._server_address)
+                     await asyncio.wait_for(
+                         channel.channel_ready(),
+                         timeout=_CONNECT_TIMEOUT_SEC,
+                     )
+                     return channel
+                 except Exception:
+                     self._logger.error(
+                         f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
+                     )
+                     try:
+                         await channel.close()
+                     except Exception as e:
+                         self._logger.error(
+                             "failed closing not established channel", exc_info=e
+                         )
+
+                     metric_grpc_server_channel_creation_retries.inc()
+                     await asyncio.sleep(_RETRY_INTERVAL_SEC)
+
+     async def shutdown(self):
+         self._is_shutdown = True
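The new indexify/executor/grpc/channel_creator.py retries until the Server's gRPC endpoint becomes reachable. A hedged usage sketch (the address and the structlog-style logger with .bind() are placeholders, not values from the diff):

import grpc.aio

from indexify.executor.grpc.channel_creator import ChannelCreator

async def connect_to_server(logger) -> grpc.aio.Channel:
    creator = ChannelCreator(server_address="localhost:8900", logger=logger)
    # Blocks, retrying every _RETRY_INTERVAL_SEC seconds, until the channel is ready;
    # after shutdown() is called the loop exits and the coroutine returns None.
    return await creator.create()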
@@ -0,0 +1,18 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_fast_operation
+
+ metric_grpc_server_channel_creations = prometheus_client.Counter(
+     "grpc_server_channel_creations",
+     "Number of times a channel to gRPC Server was created",
+ )
+ metric_grpc_server_channel_creation_retries = prometheus_client.Counter(
+     "grpc_server_channel_creation_retries",
+     "Number of retries during a channel creation to gRPC Server",
+ )
+ metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "grpc_server_channel_creation",
+         "gRPC server channel creation",
+     )
+ )
@@ -0,0 +1,17 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_fast_operation
+
+ metric_state_report_rpcs = prometheus_client.Counter(
+     "state_report_rpcs",
+     "Number of Executor state report RPCs to Server",
+ )
+ metric_state_report_errors = prometheus_client.Counter(
+     "state_report_rpc_errors",
+     "Number of Executor state report RPC errors",
+ )
+ metric_state_report_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "state_report_rpc", "Executor state report rpc to Server"
+     )
+ )