indexify 0.4.16__tar.gz → 0.4.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {indexify-0.4.16 → indexify-0.4.18}/PKG-INFO +3 -3
  2. {indexify-0.4.16 → indexify-0.4.18}/pyproject.toml +3 -3
  3. indexify-0.4.18/src/indexify/executor/channel_manager.py +167 -0
  4. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/metrics/state_reporter.py +2 -2
  5. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +4 -0
  6. indexify-0.4.18/src/indexify/executor/monitoring/health_checker/metrics/health_checker.py +5 -0
  7. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/state_reconciler.py +37 -9
  8. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/state_reporter.py +66 -48
  9. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/proto/executor_api.proto +0 -2
  10. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/proto/executor_api_pb2_grpc.py +0 -2
  11. indexify-0.4.16/src/indexify/executor/channel_manager.py +0 -195
  12. {indexify-0.4.16 → indexify-0.4.18}/README.md +0 -0
  13. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/cli/__init__.py +0 -0
  14. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/cli/build_image.py +0 -0
  15. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/cli/deploy.py +0 -0
  16. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/cli/executor.py +0 -0
  17. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/README.md +0 -0
  18. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/blob_store/blob_store.py +0 -0
  19. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
  20. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
  21. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
  22. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/executor.py +1 -1
  23. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_allowlist.py +0 -0
  24. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/function_executor.py +0 -0
  25. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/health_checker.py +0 -0
  26. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  27. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  28. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  29. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  30. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  31. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  32. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  33. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  34. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  35. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/__init__.py +0 -0
  36. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/completed_task_metrics.py +0 -0
  37. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/create_function_executor.py +0 -0
  38. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/debug_event_loop.py +0 -0
  39. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/downloads.py +0 -0
  40. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/events.py +0 -0
  41. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/function_executor_controller.py +0 -0
  42. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -0
  43. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/loggers.py +0 -0
  44. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/message_validators.py +0 -0
  45. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -0
  46. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/metrics/downloads.py +0 -0
  47. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py +0 -0
  48. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/metrics/run_task.py +0 -0
  49. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -0
  50. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/prepare_task.py +0 -0
  51. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/run_task.py +0 -0
  52. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/task_info.py +0 -0
  53. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/task_output.py +0 -0
  54. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/terminate_function_executor.py +0 -0
  55. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/function_executor_controller/upload_task_output.py +0 -0
  56. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/host_resources/host_resources.py +0 -0
  57. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/host_resources/nvidia_gpu.py +0 -0
  58. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
  59. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/metrics/channel_manager.py +0 -0
  60. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/metrics/executor.py +0 -0
  61. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/metrics/state_reconciler.py +0 -0
  62. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/handler.py +0 -0
  63. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  64. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  65. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/metrics.py +0 -0
  66. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  67. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/server.py +0 -0
  68. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  69. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/proto/executor_api_pb2.py +0 -0
  70. {indexify-0.4.16 → indexify-0.4.18}/src/indexify/proto/executor_api_pb2.pyi +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.4.16
3
+ Version: 0.4.18
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -14,10 +14,10 @@ Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
16
  Requires-Dist: aiohttp (>=3.12.14,<4.0.0)
17
- Requires-Dist: boto3 (>=1.39.6,<2.0.0)
17
+ Requires-Dist: boto3 (>=1.39.8,<2.0.0)
18
18
  Requires-Dist: prometheus-client (>=0.22.1,<0.23.0)
19
19
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
20
- Requires-Dist: tensorlake (==0.2.25)
20
+ Requires-Dist: tensorlake (==0.2.27)
21
21
  Project-URL: Repository, https://github.com/tensorlakeai/indexify
22
22
  Description-Content-Type: text/markdown
23
23
 
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this packages are updated.
4
- version = "0.4.16"
4
+ version = "0.4.18"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -23,10 +23,10 @@ python = "^3.10"
23
23
  aiohttp = "^3.12.14"
24
24
  prometheus-client = "^0.22.1"
25
25
  psutil = "^7.0.0"
26
- boto3 = "^1.39.6"
26
+ boto3 = "^1.39.8"
27
27
  # Adds function-executor binary, utils lib, sdk used in indexify-cli commands.
28
28
  # We need to specify the tensorlake version exactly because pip install doesn't respect poetry.lock files.
29
- tensorlake = "0.2.25"
29
+ tensorlake = "0.2.27"
30
30
  # Uncomment the next line to use local tensorlake package (only for development!)
31
31
  # tensorlake = { path = "../tensorlake", develop = true }
32
32
  # pydantic is provided by tensorlake
@@ -0,0 +1,167 @@
1
+ import asyncio
2
+ import time
3
+ from typing import Any, Dict, Optional
4
+
5
+ import grpc.aio
6
+ import yaml
7
+
8
+ from .metrics.channel_manager import (
9
+ metric_grpc_server_channel_creation_latency,
10
+ metric_grpc_server_channel_creation_retries,
11
+ metric_grpc_server_channel_creations,
12
+ )
13
+
14
+ _RETRY_INTERVAL_SEC = 5
15
+
16
+
17
+ class ChannelManager:
18
+ def __init__(
19
+ self,
20
+ server_address: str,
21
+ config_path: Optional[str],
22
+ logger: Any,
23
+ ):
24
+ self._logger: Any = logger.bind(module=__name__, server_address=server_address)
25
+ self._server_address: str = server_address
26
+ self._channel_credentials: Optional[grpc.ChannelCredentials] = None
27
+ # Shared channel used by different Executor components to communicate with Server.
28
+ self._shared_channel_lock = asyncio.Lock()
29
+ self._shared_channel: Optional[grpc.aio.Channel] = None
30
+
31
+ self._init_tls(config_path)
32
+
33
+ def _init_tls(self, config_path: Optional[str]):
34
+ if config_path is None:
35
+ return
36
+
37
+ # The same config file format as in Tensorlake SDK HTTP client, see:
38
+ # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
39
+ with open(config_path, "r") as config_file:
40
+ config = yaml.safe_load(config_file)
41
+
42
+ if not config.get("use_tls", False):
43
+ return
44
+
45
+ tls_config: Dict[str, str] = config["tls_config"]
46
+ cert_path: Optional[str] = tls_config.get("cert_path", None)
47
+ key_path: Optional[str] = tls_config.get("key_path", None)
48
+ ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
49
+
50
+ self._logger = self._logger.bind(
51
+ cert_path=cert_path,
52
+ key_path=key_path,
53
+ ca_bundle_path=ca_bundle_path,
54
+ )
55
+ self._logger.info("TLS is enabled for grpc channels to server")
56
+
57
+ private_key: Optional[bytes] = None
58
+ certificate_chain: Optional[bytes] = None
59
+ root_certificates: Optional[bytes] = None
60
+
61
+ if cert_path is not None:
62
+ with open(cert_path, "rb") as cert_file:
63
+ certificate_chain = cert_file.read()
64
+ if key_path is not None:
65
+ with open(key_path, "rb") as key_file:
66
+ private_key = key_file.read()
67
+ if ca_bundle_path is not None:
68
+ with open(ca_bundle_path, "rb") as ca_bundle_file:
69
+ root_certificates = ca_bundle_file.read()
70
+
71
+ self._channel_credentials = grpc.ssl_channel_credentials(
72
+ root_certificates=root_certificates,
73
+ private_key=private_key,
74
+ certificate_chain=certificate_chain,
75
+ )
76
+
77
+ async def destroy(self):
78
+ # Okay to not hold the lock here as we're destroying the server channel forever.
79
+ if self._shared_channel is not None:
80
+ await self._destroy_shared_channel()
81
+
82
+ async def fail_shared_channel(self) -> None:
83
+ """Marks the shared channel as unhealthy and creates a new one.
84
+
85
+ Doesn't raise any exceptions.
86
+ """
87
+ async with self._shared_channel_lock:
88
+ if self._shared_channel is None:
89
+ self._logger.error(
90
+ "grpc server channel doesn't exist, can't mark it unhealthy"
91
+ )
92
+ return
93
+
94
+ self._logger.info("marking grpc server channel as unhealthy")
95
+ # All the channel users will see it failing because we destroyed it, and they will call get_shared_channel() again.
96
+ await self._destroy_shared_channel()
97
+
98
+ async def get_shared_channel(self) -> grpc.aio.Channel:
99
+ """Returns shared channel to the gRPC server.
100
+
101
+ The health of the shared channel is constantly monitored so it's more reliable than using a
102
+ standalone channel created for a particular short term need. Doesn't raise any exceptions.
103
+ """
104
+ # Use the lock to ensure that we only create one channel without race conditions.
105
+ async with self._shared_channel_lock:
106
+ if self._shared_channel is None:
107
+ await self._create_shared_channel()
108
+
109
+ return self._shared_channel
110
+
111
+ def create_standalone_channel(self) -> grpc.aio.Channel:
112
+ """Creates a new channel to the gRPC server.
113
+
114
+ Used for one-off RPCs where we don't need to monitor channel health or retry its creation indefinitely.
115
+ Raises an exception on failure.
116
+ """
117
+ with (
118
+ metric_grpc_server_channel_creation_retries.count_exceptions(),
119
+ metric_grpc_server_channel_creation_latency.time(),
120
+ ):
121
+ metric_grpc_server_channel_creations.inc()
122
+ if self._channel_credentials is None:
123
+ return grpc.aio.insecure_channel(target=self._server_address)
124
+ else:
125
+ return grpc.aio.secure_channel(
126
+ target=self._server_address,
127
+ credentials=self._channel_credentials,
128
+ )
129
+
130
+ async def _create_shared_channel(self) -> None:
131
+ """Creates new shared channel.
132
+
133
+ self._shared_channel_lock must be acquired before calling this method.
134
+ Never raises any exceptions.
135
+ """
136
+ while True:
137
+ try:
138
+ create_channel_start = time.monotonic()
139
+ self._logger.info("creating new grpc channel to server")
140
+ self._shared_channel = self.create_standalone_channel()
141
+ # Trigger a connection attempt so callers don't see "channel closed" errors before any connection was ever attempted.
142
+ self._shared_channel.get_state(try_to_connect=True)
143
+ self._logger.info(
144
+ "created new grpc channel to server",
145
+ duration_sec=time.monotonic() - create_channel_start,
146
+ )
147
+ break
148
+ except Exception as e:
149
+ self._logger.error(
150
+ f"failed creating grpc channel to server, retrying in {_RETRY_INTERVAL_SEC} seconds",
151
+ exc_info=e,
152
+ )
153
+ await asyncio.sleep(_RETRY_INTERVAL_SEC)
154
+
155
+ async def _destroy_shared_channel(self) -> None:
156
+ """Closes the existing shared channel.
157
+
158
+ self._shared_channel_lock must be acquired before calling this method.
159
+ Never raises any exceptions.
160
+ """
161
+ try:
162
+ self._logger.info("closing grpc channel to server")
163
+ await self._shared_channel.close()
164
+ self._logger.info("closed grpc channel to server")
165
+ except Exception as e:
166
+ self._logger.error("failed closing grpc channel to server", exc_info=e)
167
+ self._shared_channel = None
@@ -6,11 +6,11 @@ metric_state_report_rpcs = prometheus_client.Counter(
6
6
  "state_report_rpcs",
7
7
  "Number of Executor state report RPCs to Server",
8
8
  )
9
- metric_state_report_errors = prometheus_client.Counter(
9
+ metric_state_report_rpc_errors = prometheus_client.Counter(
10
10
  "state_report_rpc_errors",
11
11
  "Number of Executor state report RPC errors",
12
12
  )
13
- metric_state_report_latency: prometheus_client.Histogram = (
13
+ metric_state_report_rpc_latency: prometheus_client.Histogram = (
14
14
  latency_metric_for_fast_operation(
15
15
  "state_report_rpc", "Executor state report rpc to Server"
16
16
  )
@@ -1,6 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  from .health_checker import HealthChecker, HealthCheckResult
4
+ from .metrics.health_checker import metric_healthy
4
5
 
5
6
  _HEALTH_CHECKER_NAME = "GenericHealthChecker"
6
7
 
@@ -13,13 +14,16 @@ class GenericHealthChecker(HealthChecker):
13
14
 
14
15
  def __init__(self):
15
16
  self._server_connection_unhealthy_status_message: Optional[str] = None
17
+ metric_healthy.set(1)
16
18
 
17
19
  def server_connection_state_changed(self, is_healthy: bool, status_message: str):
18
20
  """Handle changes in server connection state."""
19
21
  if is_healthy:
20
22
  self._server_connection_unhealthy_status_message = None
23
+ metric_healthy.set(1)
21
24
  else:
22
25
  self._server_connection_unhealthy_status_message = status_message
26
+ metric_healthy.set(0)
23
27
 
24
28
  async def check(self) -> HealthCheckResult:
25
29
  if self._server_connection_unhealthy_status_message is not None:
@@ -0,0 +1,5 @@
1
+ import prometheus_client
2
+
3
+ metric_healthy = prometheus_client.Gauge(
4
+ "healthy", "1 if the executor is healthy, 0 otherwise"
5
+ )
@@ -1,6 +1,15 @@
1
1
  import asyncio
2
2
  from pathlib import Path
3
- from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Set
3
+ from typing import (
4
+ Any,
5
+ AsyncIterable,
6
+ AsyncIterator,
7
+ Dict,
8
+ Iterable,
9
+ List,
10
+ Optional,
11
+ Set,
12
+ )
4
13
 
5
14
  from tensorlake.function_executor.proto.message_validator import MessageValidator
6
15
 
@@ -33,6 +42,10 @@ from .state_reporter import ExecutorStateReporter
33
42
 
34
43
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
35
44
  _RECONCILIATION_RETRIES = 3
45
+ # If we didn't get a new desired state from the stream within this timeout then the stream might
46
+ # not be healthy due to network disruption. In this case we need to recreate the stream to make
47
+ # sure that Server really doesn't want to send us a new state.
48
+ _DESIRED_EXECUTOR_STATES_TIMEOUT_SEC = 5 * 60 # 5 minutes
36
49
 
37
50
 
38
51
  class ExecutorStateReconciler:
@@ -141,16 +154,15 @@ class ExecutorStateReconciler:
141
154
  Never raises any exceptions. Get cancelled via aio task cancellation.
142
155
  """
143
156
  while True:
157
+ desired_states_stream: Optional[AsyncIterable[DesiredExecutorState]] = None
144
158
  try:
145
- stub = ExecutorAPIStub(await self._channel_manager.get_channel())
159
+ stub = ExecutorAPIStub(await self._channel_manager.get_shared_channel())
146
160
  # Report state once before starting the stream so Server
147
161
  # doesn't use stale state it knew about this Executor in the past.
148
162
  await self._state_reporter.report_state_and_wait_for_completion()
149
163
 
150
- desired_states_stream: AsyncGenerator[DesiredExecutorState, None] = (
151
- stub.get_desired_executor_states(
152
- GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
153
- )
164
+ desired_states_stream = stub.get_desired_executor_states(
165
+ GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
154
166
  )
155
167
  self._logger.info("created new desired states stream")
156
168
  await self._process_desired_states_stream(desired_states_stream)
@@ -159,6 +171,11 @@ class ExecutorStateReconciler:
159
171
  f"error while processing desired states stream",
160
172
  exc_info=e,
161
173
  )
174
+ finally:
175
+ # Cleanly signal Server that the stream is closed by client.
176
+ # See https://stackoverflow.com/questions/72207914/how-to-stop-listening-on-a-stream-in-python-grpc-client
177
+ if desired_states_stream is not None:
178
+ desired_states_stream.cancel()
162
179
 
163
180
  self._logger.info(
164
181
  f"desired states stream closed, reconnecting in {self._server_backoff_interval_sec} sec"
@@ -166,10 +183,21 @@ class ExecutorStateReconciler:
166
183
  await asyncio.sleep(self._server_backoff_interval_sec)
167
184
 
168
185
  async def _process_desired_states_stream(
169
- self, desired_states: AsyncGenerator[DesiredExecutorState, None]
186
+ self, desired_states: AsyncIterable[DesiredExecutorState]
170
187
  ):
171
- async for new_state in desired_states:
172
- new_state: DesiredExecutorState
188
+ desired_states_iter: AsyncIterator[DesiredExecutorState] = aiter(desired_states)
189
+ while True:
190
+ try:
191
+ new_state: DesiredExecutorState = await asyncio.wait_for(
192
+ anext(desired_states_iter),
193
+ timeout=_DESIRED_EXECUTOR_STATES_TIMEOUT_SEC,
194
+ )
195
+ except asyncio.TimeoutError:
196
+ self._logger.info(
197
+ f"No desired state received from Server within {_DESIRED_EXECUTOR_STATES_TIMEOUT_SEC} sec, recreating the stream to ensure it is healthy"
198
+ )
199
+ break # Timeout reached, stream might be unhealthy, exit the loop to recreate the stream.
200
+
173
201
  validator: MessageValidator = MessageValidator(new_state)
174
202
  try:
175
203
  validator.required_field("clock")
@@ -30,10 +30,11 @@ from .function_executor_controller.loggers import task_result_logger
30
30
  from .host_resources.host_resources import HostResources, HostResourcesProvider
31
31
  from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
32
32
  from .metrics.state_reporter import (
33
- metric_state_report_errors,
34
- metric_state_report_latency,
33
+ metric_state_report_rpc_errors,
34
+ metric_state_report_rpc_latency,
35
35
  metric_state_report_rpcs,
36
36
  )
37
+ from .monitoring.health_checker.health_checker import HealthChecker
37
38
 
38
39
  _REPORTING_INTERVAL_SEC = 5
39
40
  _REPORTING_BACKOFF_SEC = 5
@@ -49,6 +50,7 @@ class ExecutorStateReporter:
49
50
  function_allowlist: List[FunctionURI],
50
51
  channel_manager: ChannelManager,
51
52
  host_resources_provider: HostResourcesProvider,
53
+ health_checker: HealthChecker,
52
54
  logger: Any,
53
55
  ):
54
56
  self._executor_id: str = executor_id
@@ -57,6 +59,7 @@ class ExecutorStateReporter:
57
59
  self._labels.update(_executor_labels())
58
60
  self._hostname: str = gethostname()
59
61
  self._channel_manager = channel_manager
62
+ self._health_checker: HealthChecker = health_checker
60
63
  self._logger: Any = logger.bind(module=__name__)
61
64
  self._allowed_functions: List[AllowedFunction] = _to_allowed_function_protos(
62
65
  function_allowlist
@@ -167,10 +170,15 @@ class ExecutorStateReporter:
167
170
  # Don't retry state report if it failed during shutdown.
168
171
  # We only do best effort last state report and Server might not be available.
169
172
  try:
170
- async with self._channel_manager.create_channel() as channel:
171
- stub = ExecutorAPIStub(channel)
172
- await self._report_state(stub)
173
- except BaseException as e:
173
+ async with self._channel_manager.create_standalone_channel() as channel:
174
+ await ExecutorAPIStub(channel).report_executor_state(
175
+ ReportExecutorStateRequest(
176
+ executor_state=self._current_executor_state(),
177
+ executor_update=self._remove_pending_update(),
178
+ ),
179
+ timeout=_REPORT_RPC_TIMEOUT_SEC,
180
+ )
181
+ except Exception as e:
174
182
  self._logger.error(
175
183
  "failed to report state during shutdown",
176
184
  exc_info=e,
@@ -187,60 +195,48 @@ class ExecutorStateReporter:
187
195
  Never raises any exceptions.
188
196
  """
189
197
  while True:
190
- stub = ExecutorAPIStub(await self._channel_manager.get_channel())
198
+ stub = ExecutorAPIStub(await self._channel_manager.get_shared_channel())
191
199
  while True:
192
200
  await self._state_report_scheduled_event.wait()
193
201
  # Clear the event immediately to report again asap if needed. This reduces latency in the system.
194
202
  self._state_report_scheduled_event.clear()
195
203
  try:
196
- # The periodic state reports serve as channel health monitoring requests
197
- # (same as TCP keep-alive). Channel Manager returns the same healthy channel
198
- # for all RPCs that we do from Executor to Server. So all the RPCs benefit
199
- # from this channel health monitoring.
200
- await self._report_state(stub)
204
+ state: ExecutorState = self._current_executor_state()
205
+ update: ExecutorUpdate = self._remove_pending_update()
206
+ _log_reported_executor_update(update, self._logger)
207
+
208
+ with (
209
+ metric_state_report_rpc_errors.count_exceptions(),
210
+ metric_state_report_rpc_latency.time(),
211
+ ):
212
+ metric_state_report_rpcs.inc()
213
+ await stub.report_executor_state(
214
+ ReportExecutorStateRequest(
215
+ executor_state=state, executor_update=update
216
+ ),
217
+ timeout=_REPORT_RPC_TIMEOUT_SEC,
218
+ )
201
219
  self._state_reported_event.set()
220
+ self._health_checker.server_connection_state_changed(
221
+ is_healthy=True, status_message="grpc server channel is healthy"
222
+ )
202
223
  except Exception as e:
224
+ self._add_to_pending_update(update)
203
225
  self._logger.error(
204
226
  f"failed to report state to the server, backing-off for {_REPORTING_BACKOFF_SEC} sec.",
205
227
  exc_info=e,
206
228
  )
229
+ # The periodic state reports serve as channel health monitoring requests
230
+ # (same as TCP keep-alive). Channel Manager returns the same healthy channel
231
+ # for all RPCs that we do from Executor to Server. So all the RPCs benefit
232
+ # from this channel health monitoring.
233
+ self._health_checker.server_connection_state_changed(
234
+ is_healthy=False,
235
+ status_message="grpc server channel is unhealthy",
236
+ )
237
+ await self._channel_manager.fail_shared_channel()
207
238
  await asyncio.sleep(_REPORTING_BACKOFF_SEC)
208
- break # exit the inner loop to recreate the channel if needed
209
-
210
- async def _report_state(self, stub: ExecutorAPIStub):
211
- """Reports the current state to the server represented by the supplied stub.
212
-
213
- Raises an exception on failure.
214
- """
215
- with (
216
- metric_state_report_errors.count_exceptions(),
217
- metric_state_report_latency.time(),
218
- ):
219
- metric_state_report_rpcs.inc()
220
- state: ExecutorState = self._current_executor_state()
221
- update: ExecutorUpdate = self._remove_pending_update()
222
-
223
- for task_result in update.task_results:
224
- task_result_logger(task_result, self._logger).info(
225
- "reporting task outcome",
226
- outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
227
- failure_reason=(
228
- TaskFailureReason.Name(task_result.failure_reason)
229
- if task_result.HasField("failure_reason")
230
- else "None"
231
- ),
232
- )
233
-
234
- try:
235
- await stub.report_executor_state(
236
- ReportExecutorStateRequest(
237
- executor_state=state, executor_update=update
238
- ),
239
- timeout=_REPORT_RPC_TIMEOUT_SEC,
240
- )
241
- except Exception as e:
242
- self._add_to_pending_update(update)
243
- raise
239
+ break # exit the inner loop to use the recreated channel
244
240
 
245
241
  def _current_executor_state(self) -> ExecutorState:
246
242
  """Returns the current executor state."""
@@ -284,6 +280,28 @@ class ExecutorStateReporter:
284
280
  self.add_function_executor_update(function_executor_update)
285
281
 
286
282
 
283
+ def _log_reported_executor_update(update: ExecutorUpdate, logger: Any) -> None:
284
+ """Logs the reported executor update.
285
+
286
+ Doesn't raise any exceptions."""
287
+ try:
288
+ for task_result in update.task_results:
289
+ task_result_logger(task_result, logger).info(
290
+ "reporting task outcome",
291
+ outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
292
+ failure_reason=(
293
+ TaskFailureReason.Name(task_result.failure_reason)
294
+ if task_result.HasField("failure_reason")
295
+ else "None"
296
+ ),
297
+ )
298
+ except Exception as e:
299
+ logger.error(
300
+ "failed to log reported executor update",
301
+ exc_info=e,
302
+ )
303
+
304
+
287
305
  def _to_allowed_function_protos(
288
306
  function_allowlist: List[FunctionURI],
289
307
  ) -> List[AllowedFunction]:
@@ -288,7 +288,5 @@ service ExecutorAPI {
288
288
 
289
289
  // Called by Executor to open a stream of its desired states. When Server wants Executor to change something
290
290
  // it puts a message on the stream with the new desired state of the Executor.
291
- //
292
- // Deprecated HTTP API is used to download the serialized graph and task inputs.
293
291
  rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
294
292
  }
@@ -79,8 +79,6 @@ class ExecutorAPIServicer(object):
79
79
  def get_desired_executor_states(self, request, context):
80
80
  """Called by Executor to open a stream of its desired states. When Server wants Executor to change something
81
81
  it puts a message on the stream with the new desired state of the Executor.
82
-
83
- Deprecated HTTP API is used to download the serialized graph and task inputs.
84
82
  """
85
83
  context.set_code(grpc.StatusCode.UNIMPLEMENTED)
86
84
  context.set_details("Method not implemented!")
@@ -1,195 +0,0 @@
1
- import asyncio
2
- import time
3
- from typing import Any, Dict, Optional
4
-
5
- import grpc.aio
6
- import yaml
7
-
8
- from .metrics.channel_manager import (
9
- metric_grpc_server_channel_creation_latency,
10
- metric_grpc_server_channel_creation_retries,
11
- metric_grpc_server_channel_creations,
12
- )
13
- from .monitoring.health_checker.health_checker import HealthChecker
14
-
15
- _RETRY_INTERVAL_SEC = 5
16
- _CONNECT_TIMEOUT_SEC = 5
17
-
18
-
19
- class ChannelManager:
20
- def __init__(
21
- self,
22
- server_address: str,
23
- config_path: Optional[str],
24
- health_checker: HealthChecker,
25
- logger: Any,
26
- ):
27
- self._logger: Any = logger.bind(module=__name__, server_address=server_address)
28
- self._server_address: str = server_address
29
- self._health_checker: HealthChecker = health_checker
30
- self._channel_credentials: Optional[grpc.ChannelCredentials] = None
31
- # This lock protects the fields below.
32
- self._lock = asyncio.Lock()
33
- self._channel: Optional[grpc.aio.Channel] = None
34
-
35
- self._init_tls(config_path)
36
-
37
- def _init_tls(self, config_path: Optional[str]):
38
- if config_path is None:
39
- return
40
-
41
- # The same config file format as in Tensorlake SDK HTTP client, see:
42
- # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
43
- with open(config_path, "r") as config_file:
44
- config = yaml.safe_load(config_file)
45
-
46
- if not config.get("use_tls", False):
47
- return
48
-
49
- tls_config: Dict[str, str] = config["tls_config"]
50
- cert_path: Optional[str] = tls_config.get("cert_path", None)
51
- key_path: Optional[str] = tls_config.get("key_path", None)
52
- ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
53
-
54
- self._logger = self._logger.bind(
55
- cert_path=cert_path,
56
- key_path=key_path,
57
- ca_bundle_path=ca_bundle_path,
58
- )
59
- self._logger.info("TLS is enabled for grpc channels to server")
60
-
61
- private_key: Optional[bytes] = None
62
- certificate_chain: Optional[bytes] = None
63
- root_certificates: Optional[bytes] = None
64
-
65
- if cert_path is not None:
66
- with open(cert_path, "rb") as cert_file:
67
- certificate_chain = cert_file.read()
68
- if key_path is not None:
69
- with open(key_path, "rb") as key_file:
70
- private_key = key_file.read()
71
- if ca_bundle_path is not None:
72
- with open(ca_bundle_path, "rb") as ca_bundle_file:
73
- root_certificates = ca_bundle_file.read()
74
-
75
- self._channel_credentials = grpc.ssl_channel_credentials(
76
- root_certificates=root_certificates,
77
- private_key=private_key,
78
- certificate_chain=certificate_chain,
79
- )
80
-
81
- async def destroy(self):
82
- if self._channel is not None:
83
- await self._destroy_locked_channel()
84
-
85
- async def get_channel(self) -> grpc.aio.Channel:
86
- """Returns a channel to the gRPC server.
87
-
88
- Returns a ready to use channel. Blocks until the channel is ready,
89
- never raises any exceptions.
90
- If previously returned channel is healthy then returns it again.
91
- Otherwise, returns a new channel but closes the previously returned one.
92
- """
93
- # Use the lock to ensure that we only create one channel without race conditions.
94
- async with self._lock:
95
- if self._channel is None:
96
- # Only called on Executor startup when we establish the channel for the first time.
97
- self._channel = await self._create_ready_channel()
98
- elif not await self._locked_channel_is_healthy():
99
- self._logger.info("grpc channel to server is unhealthy")
100
- self._health_checker.server_connection_state_changed(
101
- is_healthy=False,
102
- status_message="grpc channel to server is unhealthy",
103
- )
104
- await self._destroy_locked_channel()
105
- self._channel = await self._create_ready_channel()
106
- self._health_checker.server_connection_state_changed(
107
- is_healthy=True, status_message="grpc channel to server is healthy"
108
- )
109
-
110
- return self._channel
111
-
112
- def create_channel(self) -> grpc.aio.Channel:
113
- """Creates a new channel to the gRPC server.
114
-
115
- The channel is not ready to use. Raises an exception on failure.
116
- """
117
- if self._channel_credentials is None:
118
- return grpc.aio.insecure_channel(target=self._server_address)
119
- else:
120
- return grpc.aio.secure_channel(
121
- target=self._server_address,
122
- credentials=self._channel_credentials,
123
- )
124
-
125
- async def _create_ready_channel(self) -> grpc.aio.Channel:
126
- """Creates a new channel to the gRPC server."
127
-
128
- Returns a ready to use channel. Blocks until the channel
129
- is ready, never raises any exceptions.
130
- """
131
- with metric_grpc_server_channel_creation_latency.time():
132
- metric_grpc_server_channel_creations.inc()
133
- while True:
134
- try:
135
- self._logger.info("creating new grpc server channel")
136
- create_channel_start = time.monotonic()
137
- channel: grpc.Channel = self.create_channel()
138
- self._logger.info(
139
- "grpc server channel created",
140
- duration_sec=time.monotonic() - create_channel_start,
141
- )
142
-
143
- channel_ready_start = time.monotonic()
144
- await asyncio.wait_for(
145
- channel.channel_ready(),
146
- timeout=_CONNECT_TIMEOUT_SEC,
147
- )
148
- self._logger.info(
149
- "grpc server channel is established (ready)",
150
- duration_sec=time.monotonic() - channel_ready_start,
151
- )
152
-
153
- return channel
154
- except BaseException:
155
- self._logger.error(
156
- f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
157
- )
158
- try:
159
- await channel.close()
160
- except BaseException as e:
161
- self._logger.error(
162
- "failed closing not established channel", exc_info=e
163
- )
164
-
165
- metric_grpc_server_channel_creation_retries.inc()
166
- await asyncio.sleep(_RETRY_INTERVAL_SEC)
167
-
168
- async def _locked_channel_is_healthy(self) -> bool:
169
- """Checks if the channel is healthy.
170
-
171
- Returns True if the channel is healthy, False otherwise.
172
- self._lock must be acquired before calling this method.
173
- Never raises any exceptions.
174
- """
175
- try:
176
- return self._channel.get_state() == grpc.ChannelConnectivity.READY
177
- except Exception as e:
178
- # Assume that the channel is healthy because get_state() method is marked as experimental
179
- # so we can't fully trust it.
180
- self._logger.error(
181
- "failed getting channel state, assuming channel is healthy", exc_info=e
182
- )
183
- return True
184
-
185
- async def _destroy_locked_channel(self):
186
- """Closes the existing channel.
187
-
188
- self._lock must be acquired before calling this method.
189
- Never raises any exceptions.
190
- """
191
- try:
192
- await self._channel.close()
193
- except Exception as e:
194
- self._logger.error("failed closing channel", exc_info=e)
195
- self._channel = None
File without changes
@@ -69,7 +69,6 @@ class Executor:
69
69
  self._channel_manager = ChannelManager(
70
70
  server_address=grpc_server_addr,
71
71
  config_path=config_path,
72
- health_checker=health_checker,
73
72
  logger=self._logger,
74
73
  )
75
74
  function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
@@ -80,6 +79,7 @@ class Executor:
80
79
  function_allowlist=function_allowlist,
81
80
  channel_manager=self._channel_manager,
82
81
  host_resources_provider=host_resources_provider,
82
+ health_checker=health_checker,
83
83
  logger=self._logger,
84
84
  )
85
85
  self._state_reporter.update_executor_status(