indexify 0.4.15__tar.gz → 0.4.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {indexify-0.4.15 → indexify-0.4.16}/PKG-INFO +3 -3
  2. {indexify-0.4.15 → indexify-0.4.16}/pyproject.toml +3 -3
  3. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/channel_manager.py +36 -58
  4. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/executor.py +1 -0
  5. indexify-0.4.16/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +36 -0
  6. indexify-0.4.16/src/indexify/executor/monitoring/health_checker/health_checker.py +16 -0
  7. indexify-0.4.15/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -20
  8. indexify-0.4.15/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -12
  9. {indexify-0.4.15 → indexify-0.4.16}/README.md +0 -0
  10. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/cli/__init__.py +0 -0
  11. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/cli/build_image.py +0 -0
  12. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/cli/deploy.py +0 -0
  13. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/cli/executor.py +0 -0
  14. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/README.md +0 -0
  15. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/blob_store/blob_store.py +0 -0
  16. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
  17. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
  18. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
  19. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_allowlist.py +0 -0
  20. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/function_executor.py +0 -0
  21. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/health_checker.py +0 -0
  22. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  23. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  24. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  25. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  26. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  27. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  28. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  29. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  30. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  31. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/__init__.py +0 -0
  32. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/completed_task_metrics.py +0 -0
  33. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/create_function_executor.py +0 -0
  34. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/debug_event_loop.py +0 -0
  35. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/downloads.py +0 -0
  36. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/events.py +0 -0
  37. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/function_executor_controller.py +0 -0
  38. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -0
  39. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/loggers.py +0 -0
  40. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/message_validators.py +0 -0
  41. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -0
  42. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/metrics/downloads.py +0 -0
  43. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py +0 -0
  44. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/metrics/run_task.py +0 -0
  45. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -0
  46. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/prepare_task.py +0 -0
  47. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/run_task.py +0 -0
  48. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/task_info.py +0 -0
  49. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/task_output.py +0 -0
  50. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/terminate_function_executor.py +0 -0
  51. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/function_executor_controller/upload_task_output.py +0 -0
  52. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/host_resources/host_resources.py +0 -0
  53. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/host_resources/nvidia_gpu.py +0 -0
  54. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
  55. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/metrics/channel_manager.py +0 -0
  56. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/metrics/executor.py +0 -0
  57. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/metrics/state_reconciler.py +0 -0
  58. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/metrics/state_reporter.py +0 -0
  59. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/handler.py +0 -0
  60. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  61. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/metrics.py +0 -0
  62. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  63. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/server.py +0 -0
  64. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  65. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/state_reconciler.py +0 -0
  66. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/executor/state_reporter.py +0 -0
  67. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/proto/executor_api.proto +0 -0
  68. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/proto/executor_api_pb2.py +0 -0
  69. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/proto/executor_api_pb2.pyi +0 -0
  70. {indexify-0.4.15 → indexify-0.4.16}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.4.15
3
+ Version: 0.4.16
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -14,10 +14,10 @@ Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
16
  Requires-Dist: aiohttp (>=3.12.14,<4.0.0)
17
- Requires-Dist: boto3 (>=1.39.4,<2.0.0)
17
+ Requires-Dist: boto3 (>=1.39.6,<2.0.0)
18
18
  Requires-Dist: prometheus-client (>=0.22.1,<0.23.0)
19
19
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
20
- Requires-Dist: tensorlake (==0.2.24)
20
+ Requires-Dist: tensorlake (==0.2.25)
21
21
  Project-URL: Repository, https://github.com/tensorlakeai/indexify
22
22
  Description-Content-Type: text/markdown
23
23
 
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this package are updated.
4
- version = "0.4.15"
4
+ version = "0.4.16"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -23,10 +23,10 @@ python = "^3.10"
23
23
  aiohttp = "^3.12.14"
24
24
  prometheus-client = "^0.22.1"
25
25
  psutil = "^7.0.0"
26
- boto3 = "^1.39.4"
26
+ boto3 = "^1.39.6"
27
27
  # Adds function-executor binary, utils lib, sdk used in indexify-cli commands.
28
28
  # We need to specify the tensorlake version exactly because pip install doesn't respect poetry.lock files.
29
- tensorlake = "0.2.24"
29
+ tensorlake = "0.2.25"
30
30
  # Uncomment the next line to use local tensorlake package (only for development!)
31
31
  # tensorlake = { path = "../tensorlake", develop = true }
32
32
  # pydantic is provided by tensorlake
@@ -1,5 +1,5 @@
1
1
  import asyncio
2
- import os
2
+ import time
3
3
  from typing import Any, Dict, Optional
4
4
 
5
5
  import grpc.aio
@@ -10,16 +10,23 @@ from .metrics.channel_manager import (
10
10
  metric_grpc_server_channel_creation_retries,
11
11
  metric_grpc_server_channel_creations,
12
12
  )
13
+ from .monitoring.health_checker.health_checker import HealthChecker
13
14
 
14
15
  _RETRY_INTERVAL_SEC = 5
15
16
  _CONNECT_TIMEOUT_SEC = 5
16
17
 
17
18
 
18
19
  class ChannelManager:
19
- def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
20
+ def __init__(
21
+ self,
22
+ server_address: str,
23
+ config_path: Optional[str],
24
+ health_checker: HealthChecker,
25
+ logger: Any,
26
+ ):
20
27
  self._logger: Any = logger.bind(module=__name__, server_address=server_address)
21
- self._keep_alive_period_sec: int = _keep_alive_period_sec_from_env(logger)
22
28
  self._server_address: str = server_address
29
+ self._health_checker: HealthChecker = health_checker
23
30
  self._channel_credentials: Optional[grpc.ChannelCredentials] = None
24
31
  # This lock protects the fields below.
25
32
  self._lock = asyncio.Lock()
@@ -86,31 +93,33 @@ class ChannelManager:
86
93
  # Use the lock to ensure that we only create one channel without race conditions.
87
94
  async with self._lock:
88
95
  if self._channel is None:
96
+ # Only called on Executor startup when we establish the channel for the first time.
89
97
  self._channel = await self._create_ready_channel()
90
98
  elif not await self._locked_channel_is_healthy():
91
99
  self._logger.info("grpc channel to server is unhealthy")
100
+ self._health_checker.server_connection_state_changed(
101
+ is_healthy=False,
102
+ status_message="grpc channel to server is unhealthy",
103
+ )
92
104
  await self._destroy_locked_channel()
93
105
  self._channel = await self._create_ready_channel()
106
+ self._health_checker.server_connection_state_changed(
107
+ is_healthy=True, status_message="grpc channel to server is healthy"
108
+ )
94
109
 
95
110
  return self._channel
96
111
 
97
112
  def create_channel(self) -> grpc.aio.Channel:
98
113
  """Creates a new channel to the gRPC server.
99
114
 
100
- The channel is not be ready to use. Raises an exception on failure.
115
+ The channel is not ready to use. Raises an exception on failure.
101
116
  """
102
- channel_options: list[tuple[str, int]] = _channel_options(
103
- self._keep_alive_period_sec
104
- )
105
117
  if self._channel_credentials is None:
106
- return grpc.aio.insecure_channel(
107
- target=self._server_address, options=channel_options
108
- )
118
+ return grpc.aio.insecure_channel(target=self._server_address)
109
119
  else:
110
120
  return grpc.aio.secure_channel(
111
121
  target=self._server_address,
112
122
  credentials=self._channel_credentials,
113
- options=channel_options,
114
123
  )
115
124
 
116
125
  async def _create_ready_channel(self) -> grpc.aio.Channel:
@@ -119,25 +128,36 @@ class ChannelManager:
119
128
  Returns a ready to use channel. Blocks until the channel
120
129
  is ready, never raises any exceptions.
121
130
  """
122
- self._logger.info("creating new grpc server channel")
123
-
124
131
  with metric_grpc_server_channel_creation_latency.time():
125
132
  metric_grpc_server_channel_creations.inc()
126
133
  while True:
127
134
  try:
128
- channel = self.create_channel()
135
+ self._logger.info("creating new grpc server channel")
136
+ create_channel_start = time.monotonic()
137
+ channel: grpc.Channel = self.create_channel()
138
+ self._logger.info(
139
+ "grpc server channel created",
140
+ duration_sec=time.monotonic() - create_channel_start,
141
+ )
142
+
143
+ channel_ready_start = time.monotonic()
129
144
  await asyncio.wait_for(
130
145
  channel.channel_ready(),
131
146
  timeout=_CONNECT_TIMEOUT_SEC,
132
147
  )
148
+ self._logger.info(
149
+ "grpc server channel is established (ready)",
150
+ duration_sec=time.monotonic() - channel_ready_start,
151
+ )
152
+
133
153
  return channel
134
- except Exception:
154
+ except BaseException:
135
155
  self._logger.error(
136
156
  f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
137
157
  )
138
158
  try:
139
159
  await channel.close()
140
- except Exception as e:
160
+ except BaseException as e:
141
161
  self._logger.error(
142
162
  "failed closing not established channel", exc_info=e
143
163
  )
@@ -173,45 +193,3 @@ class ChannelManager:
173
193
  except Exception as e:
174
194
  self._logger.error("failed closing channel", exc_info=e)
175
195
  self._channel = None
176
-
177
-
178
- def _channel_options(keep_alive_period_sec: int) -> list[tuple[str, int]]:
179
- """Returns the gRPC channel options."""
180
- # See https://grpc.io/docs/guides/keepalive/.
181
- #
182
- # NB: Rust Tonic framework that we're using in Server is not using gRPC core and doesn't support
183
- # these options. From https://github.com/hyperium/tonic/issues/258 it supports gRPC PINGs when
184
- # there are in-flight RPCs (and streams) without any extra configuration.
185
- return [
186
- ("grpc.keepalive_time_ms", keep_alive_period_sec * 1000),
187
- (
188
- "grpc.http2.max_pings_without_data",
189
- -1,
190
- ), # Allow any number of empty PING messages
191
- (
192
- "grpc.keepalive_permit_without_calls",
193
- 0,
194
- ), # Don't send PINGs when there are no in-flight RPCs (and streams)
195
- ]
196
-
197
-
198
- def _keep_alive_period_sec_from_env(logger: Any) -> int:
199
- """Returns the keep alive period in seconds."""
200
- # We have to use gRPC keep alive (PING) to prevent proxies/load-balancers from closing underlying HTTP/2
201
- # (TCP) connections due to periods of idleness in gRPC streams that we use between Executor and Server.
202
- # If a proxy/load-balancer closes the connection, then we see it as gRPC stream errors which results in
203
- # a lot of error logs noise.
204
- #
205
- # The default period of 50 sec is used for one of the standard proxy/load-balancer timeouts of 1 minute.
206
- DEFAULT_KEEP_ALIVE_PERIOD_SEC = "50"
207
- keep_alive_period_sec = int(
208
- os.getenv(
209
- "INDEXIFY_EXECUTOR_GRPC_KEEP_ALIVE_PERIOD_SEC",
210
- DEFAULT_KEEP_ALIVE_PERIOD_SEC,
211
- )
212
- )
213
- if keep_alive_period_sec != int(DEFAULT_KEEP_ALIVE_PERIOD_SEC):
214
- logger.info(
215
- f"gRPC keep alive (PING) period is set to {keep_alive_period_sec} sec"
216
- )
217
- return keep_alive_period_sec
@@ -69,6 +69,7 @@ class Executor:
69
69
  self._channel_manager = ChannelManager(
70
70
  server_address=grpc_server_addr,
71
71
  config_path=config_path,
72
+ health_checker=health_checker,
72
73
  logger=self._logger,
73
74
  )
74
75
  function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
@@ -0,0 +1,36 @@
1
+ from typing import Optional
2
+
3
+ from .health_checker import HealthChecker, HealthCheckResult
4
+
5
+ _HEALTH_CHECKER_NAME = "GenericHealthChecker"
6
+
7
+
8
+ class GenericHealthChecker(HealthChecker):
9
+ """A generic health checker that doesn't depend on machine type and other features of the environment.
10
+
11
+ The health checker uses software signals available in all environments like Function Executor failure rates.
12
+ """
13
+
14
+ def __init__(self):
15
+ self._server_connection_unhealthy_status_message: Optional[str] = None
16
+
17
+ def server_connection_state_changed(self, is_healthy: bool, status_message: str):
18
+ """Handle changes in server connection state."""
19
+ if is_healthy:
20
+ self._server_connection_unhealthy_status_message = None
21
+ else:
22
+ self._server_connection_unhealthy_status_message = status_message
23
+
24
+ async def check(self) -> HealthCheckResult:
25
+ if self._server_connection_unhealthy_status_message is not None:
26
+ return HealthCheckResult(
27
+ is_success=False,
28
+ status_message=self._server_connection_unhealthy_status_message,
29
+ checker_name=_HEALTH_CHECKER_NAME,
30
+ )
31
+
32
+ return HealthCheckResult(
33
+ is_success=True,
34
+ status_message="Successful",
35
+ checker_name=_HEALTH_CHECKER_NAME,
36
+ )
@@ -0,0 +1,16 @@
1
+ class HealthCheckResult:
2
+ def __init__(self, checker_name: str, is_success: bool, status_message: str):
3
+ self.checker_name: str = checker_name
4
+ self.is_success: bool = is_success
5
+ self.status_message: str = status_message
6
+
7
+
8
+ class HealthChecker:
9
+ """Abstract base class for health checkers."""
10
+
11
+ def server_connection_state_changed(self, is_healthy: bool, status_message: str):
12
+ """Handle changes in server connection state."""
13
+ raise NotImplementedError("Subclasses must implement this method.")
14
+
15
+ async def check(self) -> HealthCheckResult:
16
+ raise NotImplementedError("Subclasses must implement this method.")
@@ -1,20 +0,0 @@
1
- from .health_checker import HealthChecker, HealthCheckResult
2
-
3
- HEALTH_CHECKER_NAME = "GenericHealthChecker"
4
-
5
-
6
- class GenericHealthChecker(HealthChecker):
7
- """A generic health checker that doesn't depend on machine type and other features of the environment.
8
-
9
- The health checker uses software signals available in all environments like Function Executor failure rates.
10
- """
11
-
12
- def __init__(self):
13
- pass
14
-
15
- async def check(self) -> HealthCheckResult:
16
- return HealthCheckResult(
17
- is_success=True,
18
- status_message="The health check is always successful",
19
- checker_name=HEALTH_CHECKER_NAME,
20
- )
@@ -1,12 +0,0 @@
1
- class HealthCheckResult:
2
- def __init__(self, checker_name: str, is_success: bool, status_message: str):
3
- self.checker_name = checker_name
4
- self.is_success = is_success
5
- self.status_message = status_message
6
-
7
-
8
- class HealthChecker:
9
- """Abstract base class for health checkers."""
10
-
11
- async def check(self) -> HealthCheckResult:
12
- raise NotImplementedError("Subclasses must implement this method.")
File without changes