indexify 0.3.16__tar.gz → 0.3.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.16 → indexify-0.3.17}/PKG-INFO +1 -1
- {indexify-0.3.16 → indexify-0.3.17}/pyproject.toml +1 -1
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/function_executor.py +5 -2
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/health_checker.py +37 -13
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/task_runner.py +7 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/task_runner.py +4 -0
- {indexify-0.3.16 → indexify-0.3.17}/README.md +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/cli/cli.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/api_objects.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/downloader.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/executor.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/function_executor_state.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/function_executor_states_container.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/task_output.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/grpc/channel_creator.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/grpc/metrics/channel_creator.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/grpc/state_reconciler.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/grpc/state_reporter.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/downloader.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/executor.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/task_fetcher.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/task_reporter.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/handler.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/metrics.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/server.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/runtime_probes.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/task_fetcher.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/task_reporter.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/proto/task_scheduler.proto +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/proto/task_scheduler_pb2.py +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/proto/task_scheduler_pb2.pyi +0 -0
- {indexify-0.3.16 → indexify-0.3.17}/src/indexify/proto/task_scheduler_pb2_grpc.py +0 -0
{indexify-0.3.16 → indexify-0.3.17}/pyproject.toml
RENAMED
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this packages are updated.
|
4
|
-
version = "0.3.16"
|
4
|
+
version = "0.3.17"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/function_executor.py
RENAMED
@@ -110,7 +110,7 @@ class FunctionExecutor:
|
|
110
110
|
config_path=config_path,
|
111
111
|
initialize_request=initialize_request,
|
112
112
|
)
|
113
|
-
await self._create_health_checker(stub)
|
113
|
+
await self._create_health_checker(self._channel, stub)
|
114
114
|
self._initialized = True
|
115
115
|
except Exception:
|
116
116
|
await self.destroy()
|
@@ -243,12 +243,15 @@ class FunctionExecutor:
|
|
243
243
|
finally:
|
244
244
|
self._invocation_state_client = None
|
245
245
|
|
246
|
-
async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
|
246
|
+
async def _create_health_checker(
|
247
|
+
self, channel: grpc.aio.Channel, stub: FunctionExecutorStub
|
248
|
+
) -> None:
|
247
249
|
with (
|
248
250
|
metric_create_health_checker_errors.count_exceptions(),
|
249
251
|
metric_create_health_checker_latency.time(),
|
250
252
|
):
|
251
253
|
self._health_checker = HealthChecker(
|
254
|
+
channel=channel,
|
252
255
|
stub=stub,
|
253
256
|
logger=self._logger,
|
254
257
|
)
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/health_checker.py
RENAMED
@@ -1,8 +1,10 @@
|
|
1
1
|
import asyncio
|
2
|
+
import os
|
2
3
|
from collections.abc import Awaitable, Callable
|
3
4
|
from typing import Any, Optional
|
4
5
|
|
5
|
-
|
6
|
+
import grpc
|
7
|
+
import grpc.aio
|
6
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
9
|
HealthCheckRequest,
|
8
10
|
HealthCheckResponse,
|
@@ -27,7 +29,10 @@ class HealthCheckResult:
|
|
27
29
|
|
28
30
|
|
29
31
|
class HealthChecker:
|
30
|
-
def __init__(self, stub: FunctionExecutorStub, logger: Any):
|
32
|
+
def __init__(
|
33
|
+
self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
|
34
|
+
):
|
35
|
+
self._channel: grpc.aio.Channel = channel
|
31
36
|
self._stub: FunctionExecutorStub = stub
|
32
37
|
self._logger: Any = logger.bind(module=__name__)
|
33
38
|
self._health_check_loop_task: Optional[asyncio.Task] = None
|
@@ -39,6 +44,12 @@ class HealthChecker:
|
|
39
44
|
"""Runs the health check once and returns the result.
|
40
45
|
|
41
46
|
Does not raise any exceptions."""
|
47
|
+
if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
|
48
|
+
return HealthCheckResult(
|
49
|
+
is_healthy=True,
|
50
|
+
reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
|
51
|
+
)
|
52
|
+
|
42
53
|
with metric_health_check_latency.time():
|
43
54
|
try:
|
44
55
|
response: HealthCheckResponse = await self._stub.check_health(
|
@@ -49,19 +60,32 @@ class HealthChecker:
|
|
49
60
|
return HealthCheckResult(
|
50
61
|
is_healthy=response.healthy, reason=response.status_message
|
51
62
|
)
|
52
|
-
except AioRpcError as e:
|
53
|
-
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
63
|
+
except grpc.aio.AioRpcError as e:
|
64
|
+
# Due to the customer code running in Function Executor we can't reliably conclude
|
65
|
+
# that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
|
66
|
+
# hold Python GIL and prevent the health check RPC from being processed by FE Python code.
|
67
|
+
#
|
68
|
+
# The only unhealthy condition we can be sure about is when the channel can't re-establish
|
69
|
+
# the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
|
70
|
+
# code is not involved when TCP connections are established to FE. Problems reestablishing
|
71
|
+
# the TCP connection are usually due to the FE process crashing and its gRPC server socket
|
72
|
+
# not being available anymore or due to prolonged local networking failures on Executor.
|
73
|
+
channel_connectivity = self._channel.get_state()
|
74
|
+
if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
|
75
|
+
return HealthCheckResult(
|
76
|
+
is_healthy=False,
|
77
|
+
reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
|
78
|
+
)
|
79
|
+
else:
|
80
|
+
return HealthCheckResult(
|
81
|
+
is_healthy=True,
|
82
|
+
reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
|
83
|
+
)
|
59
84
|
except Exception as e:
|
60
|
-
|
61
|
-
self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
|
85
|
+
self._logger.error("Got unexpected exception, ignoring", exc_info=e)
|
62
86
|
return HealthCheckResult(
|
63
|
-
is_healthy=False,
|
64
|
-
reason=f"Unexpected exception in Executor: {str(e)}",
|
87
|
+
is_healthy=True,
|
88
|
+
reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
|
65
89
|
)
|
66
90
|
|
67
91
|
def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/metrics/task_runner.py
RENAMED
@@ -23,6 +23,13 @@ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gaug
|
|
23
23
|
"tasks_blocked_by_policy",
|
24
24
|
"Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
|
25
25
|
)
|
26
|
+
metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
|
27
|
+
prometheus_client.Gauge(
|
28
|
+
"tasks_blocked_by_policy_per_function_name",
|
29
|
+
"Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
|
30
|
+
["function_name"],
|
31
|
+
)
|
32
|
+
)
|
26
33
|
|
27
34
|
# Metrics for the stage when task is running.
|
28
35
|
metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/task_runner.py
RENAMED
@@ -22,6 +22,7 @@ from .metrics.task_runner import (
|
|
22
22
|
metric_task_run_platform_errors,
|
23
23
|
metric_task_runs,
|
24
24
|
metric_tasks_blocked_by_policy,
|
25
|
+
metric_tasks_blocked_by_policy_per_function_name,
|
25
26
|
metric_tasks_running,
|
26
27
|
)
|
27
28
|
|
@@ -55,6 +56,9 @@ class TaskRunner:
|
|
55
56
|
with (
|
56
57
|
metric_task_policy_errors.count_exceptions(),
|
57
58
|
metric_tasks_blocked_by_policy.track_inprogress(),
|
59
|
+
metric_tasks_blocked_by_policy_per_function_name.labels(
|
60
|
+
function_name=task_input.task.compute_fn
|
61
|
+
).track_inprogress(),
|
58
62
|
metric_task_policy_latency.time(),
|
59
63
|
):
|
60
64
|
metric_task_policy_runs.inc()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/function_executor/single_task_runner.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/health_check_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/prometheus_metrics_handler.py
RENAMED
File without changes
|
File without changes
|
{indexify-0.3.16 → indexify-0.3.17}/src/indexify/executor/monitoring/startup_probe_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|