indexify 0.3.16__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,7 +110,7 @@ class FunctionExecutor:
110
110
  config_path=config_path,
111
111
  initialize_request=initialize_request,
112
112
  )
113
- await self._create_health_checker(stub)
113
+ await self._create_health_checker(self._channel, stub)
114
114
  self._initialized = True
115
115
  except Exception:
116
116
  await self.destroy()
@@ -243,12 +243,15 @@ class FunctionExecutor:
243
243
  finally:
244
244
  self._invocation_state_client = None
245
245
 
246
- async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
246
+ async def _create_health_checker(
247
+ self, channel: grpc.aio.Channel, stub: FunctionExecutorStub
248
+ ) -> None:
247
249
  with (
248
250
  metric_create_health_checker_errors.count_exceptions(),
249
251
  metric_create_health_checker_latency.time(),
250
252
  ):
251
253
  self._health_checker = HealthChecker(
254
+ channel=channel,
252
255
  stub=stub,
253
256
  logger=self._logger,
254
257
  )
@@ -1,8 +1,10 @@
1
1
  import asyncio
2
+ import os
2
3
  from collections.abc import Awaitable, Callable
3
4
  from typing import Any, Optional
4
5
 
5
- from grpc.aio import AioRpcError
6
+ import grpc
7
+ import grpc.aio
6
8
  from tensorlake.function_executor.proto.function_executor_pb2 import (
7
9
  HealthCheckRequest,
8
10
  HealthCheckResponse,
@@ -27,7 +29,10 @@ class HealthCheckResult:
27
29
 
28
30
 
29
31
  class HealthChecker:
30
- def __init__(self, stub: FunctionExecutorStub, logger: Any):
32
+ def __init__(
33
+ self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
34
+ ):
35
+ self._channel: grpc.aio.Channel = channel
31
36
  self._stub: FunctionExecutorStub = stub
32
37
  self._logger: Any = logger.bind(module=__name__)
33
38
  self._health_check_loop_task: Optional[asyncio.Task] = None
@@ -39,6 +44,12 @@ class HealthChecker:
39
44
  """Runs the health check once and returns the result.
40
45
 
41
46
  Does not raise any exceptions."""
47
+ if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
48
+ return HealthCheckResult(
49
+ is_healthy=True,
50
+ reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
51
+ )
52
+
42
53
  with metric_health_check_latency.time():
43
54
  try:
44
55
  response: HealthCheckResponse = await self._stub.check_health(
@@ -49,19 +60,32 @@ class HealthChecker:
49
60
  return HealthCheckResult(
50
61
  is_healthy=response.healthy, reason=response.status_message
51
62
  )
52
- except AioRpcError as e:
53
- metric_failed_health_checks.inc()
54
- # Expected exception when there are problems with communication because e.g. the server is unhealthy.
55
- return HealthCheckResult(
56
- is_healthy=False,
57
- reason=f"Executor side RPC channel error: {str(e)}",
58
- )
63
+ except grpc.aio.AioRpcError as e:
64
+ # Due to the customer code running in Function Executor we can't reliably conclude
65
+ # that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
66
+ # hold Python GIL and prevent the health check RPC from being processed by FE Python code.
67
+ #
68
+ # The only unhealthy condition we can be sure about is when the channel can't re-establish
69
+ # the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
70
+ # code is not involved when TCP connections are established to FE. Problems reestablishing
71
+ # the TCP connection are usually due to the FE process crashing and its gRPC server socket
72
+ # not being available anymore or due to prolonged local networking failures on Executor.
73
+ channel_connectivity = self._channel.get_state()
74
+ if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
75
+ return HealthCheckResult(
76
+ is_healthy=False,
77
+ reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
78
+ )
79
+ else:
80
+ return HealthCheckResult(
81
+ is_healthy=True,
82
+ reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
83
+ )
59
84
  except Exception as e:
60
- metric_failed_health_checks.inc()
61
- self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
85
+ self._logger.error("Got unexpected exception, ignoring", exc_info=e)
62
86
  return HealthCheckResult(
63
- is_healthy=False,
64
- reason=f"Unexpected exception in Executor: {str(e)}",
87
+ is_healthy=True,
88
+ reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
65
89
  )
66
90
 
67
91
  def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
@@ -23,6 +23,13 @@ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gaug
23
23
  "tasks_blocked_by_policy",
24
24
  "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
25
25
  )
26
+ metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
27
+ prometheus_client.Gauge(
28
+ "tasks_blocked_by_policy_per_function_name",
29
+ "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
30
+ ["function_name"],
31
+ )
32
+ )
26
33
 
27
34
  # Metrics for the stage when task is running.
28
35
  metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
@@ -22,6 +22,7 @@ from .metrics.task_runner import (
22
22
  metric_task_run_platform_errors,
23
23
  metric_task_runs,
24
24
  metric_tasks_blocked_by_policy,
25
+ metric_tasks_blocked_by_policy_per_function_name,
25
26
  metric_tasks_running,
26
27
  )
27
28
 
@@ -55,6 +56,9 @@ class TaskRunner:
55
56
  with (
56
57
  metric_task_policy_errors.count_exceptions(),
57
58
  metric_tasks_blocked_by_policy.track_inprogress(),
59
+ metric_tasks_blocked_by_policy_per_function_name.labels(
60
+ function_name=task_input.task.compute_fn
61
+ ).track_inprogress(),
58
62
  metric_task_policy_latency.time(),
59
63
  ):
60
64
  metric_task_policy_runs.inc()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.16
3
+ Version: 0.3.17
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -3,11 +3,11 @@ indexify/executor/README.md,sha256=ozC6_hMkhQQNVCMEpBxwiUALz6lwErPQxNxQfQDqnG4,2
3
3
  indexify/executor/api_objects.py,sha256=oUlH-GQPuPmwgcBzMpI2HehXeElBTCULECk-oHiBHwU,1263
4
4
  indexify/executor/downloader.py,sha256=LkvAXfKxddnDzgfmwHcpDB_n795-eVKzn-hLjq4nUEM,9412
5
5
  indexify/executor/executor.py,sha256=FTp05YxuKlMUbI99OV7NkL3KuFD12evKcqxzb-fXWBk,14641
6
- indexify/executor/function_executor/function_executor.py,sha256=sQ5FOdrjybDDsjagghlfjV06IXTpWWDBSTHqQXI-w9M,11245
6
+ indexify/executor/function_executor/function_executor.py,sha256=s1mc7g6b8ilc98Fp7RFElEBSLJl0UGNQY0iZzCpuR2A,11334
7
7
  indexify/executor/function_executor/function_executor_state.py,sha256=b2taGClg0BUnlD_rYGkpom6syXBMUp7UWWrjLrUCwyo,3966
8
8
  indexify/executor/function_executor/function_executor_states_container.py,sha256=RclJDJqIr8ywKipPBC6_idnPAqYi0dPa1d4QUAaXqbw,3460
9
9
  indexify/executor/function_executor/function_executor_status.py,sha256=U4p1fcdVWlHr7uPY7e7ZSb2_WelUmPeH-WgboQQ9mw4,3336
10
- indexify/executor/function_executor/health_checker.py,sha256=qUUpG4oeVsPLibiCspAiRm-2Ldg46ulnnpj9EBXr1NQ,3916
10
+ indexify/executor/function_executor/health_checker.py,sha256=CPUWvvtJtJCwbEsfr_BOhSKkRki4qOoSk1oeyBosWz0,5464
11
11
  indexify/executor/function_executor/invocation_state_client.py,sha256=p-xgM4__cHR1ApvMV9hShrGWee_Je0VDhICZUGjpQY4,9644
12
12
  indexify/executor/function_executor/metrics/function_executor.py,sha256=TDksxLRJr-P9ZKhF2Orsaxzzb4lVIBxFEjd_9Zv53Ng,6313
13
13
  indexify/executor/function_executor/metrics/function_executor_state.py,sha256=qheMhnoiYLiZB7ky5EyegfDy4Mr0Zh83bOE0gJ38YmU,1607
@@ -32,7 +32,7 @@ indexify/executor/metrics/downloader.py,sha256=lctPh8xjkXeLEFJnl1hNrD1yEhLhIl5sg
32
32
  indexify/executor/metrics/executor.py,sha256=ua-Vv_k1CB4juJdF7tEBQbBMksqWAA3iXKKMKXZUCLk,2369
33
33
  indexify/executor/metrics/task_fetcher.py,sha256=iJEwCLzYr2cuz7hRvNiqaa2nvQP4OrA0hm0iJY0YKG0,736
34
34
  indexify/executor/metrics/task_reporter.py,sha256=zUA9RpkSgx5lG_ZqDDuela5VuhtsnC0IKoQcEvHND0Y,730
35
- indexify/executor/metrics/task_runner.py,sha256=o5ERNePKPmVKknFoSZUr-r597dEOOWvWn3ocbiL2jxI,1699
35
+ indexify/executor/metrics/task_runner.py,sha256=ZGFrl7zzfUdgPZnklxRIbnv9wVcHIQRhOGNqn9V2hSk,2047
36
36
  indexify/executor/monitoring/function_allowlist.py,sha256=wUGeiv3aAGWMlQXzHXq9O6MVHby6Tu-zY4U0MyWiQu0,683
37
37
  indexify/executor/monitoring/handler.py,sha256=Cj1cu_LcsAP0tdviqNhoEtGm4h0OJAxxzW9C2YdNXYU,240
38
38
  indexify/executor/monitoring/health_check_handler.py,sha256=e1pEtWFKaVs6H57Z4YLejNECrJtC38PweZc7xTJeqVw,695
@@ -45,12 +45,12 @@ indexify/executor/monitoring/startup_probe_handler.py,sha256=zXXsBU15SMlBx1bSFpx
45
45
  indexify/executor/runtime_probes.py,sha256=bo6Dq6AGZpJH099j0DHtVSDEH80tv3j9MXf3VXSx_p8,2182
46
46
  indexify/executor/task_fetcher.py,sha256=NpFfHgaY99bSL-K2D5kcDAMNUG2FArq0-qF_mgF-LBQ,3375
47
47
  indexify/executor/task_reporter.py,sha256=mYgwozUO95PEwYMmeeIS0-HfMrO4z3Nhy6IduMsMahM,7367
48
- indexify/executor/task_runner.py,sha256=TGiTNE68HCm38HcpPRN5-hwNiTFw_gwBRkCcrw7sz2Q,6847
48
+ indexify/executor/task_runner.py,sha256=1zYH03yS_FaFk9xXBl-ioM74-L2xdW3vHJt522mseds,7073
49
49
  indexify/proto/task_scheduler.proto,sha256=kxMIJCj1pXG-fHeJGHXlthZTsB1dy_yvshQLt0UJRTM,5672
50
50
  indexify/proto/task_scheduler_pb2.py,sha256=X97JBJZ2n6ToDtUlDjPFV66_vZ05-vO8wPATrpzAonA,9085
51
51
  indexify/proto/task_scheduler_pb2.pyi,sha256=aXrB7-eNwgchy2OVlvEfPXtr9EyYoU-sgbdSRVNEI8s,11357
52
52
  indexify/proto/task_scheduler_pb2_grpc.py,sha256=STtk9XrBzLbmWdLwpL55Obyf9ehUesfxxysxER32SEE,6854
53
- indexify-0.3.16.dist-info/METADATA,sha256=3jBagszOve3WAI9dTFN7ZtT1x_0hBFhLPOFYs-KQ6AU,1158
54
- indexify-0.3.16.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
55
- indexify-0.3.16.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
56
- indexify-0.3.16.dist-info/RECORD,,
53
+ indexify-0.3.17.dist-info/METADATA,sha256=hREMWJfSrd4Vcclp2w8fcUnjtvkiXHw6jMCTWECKAtw,1158
54
+ indexify-0.3.17.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
55
+ indexify-0.3.17.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
56
+ indexify-0.3.17.dist-info/RECORD,,