indexify 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +20 -91
- indexify/executor/api_objects.py +2 -0
- indexify/executor/executor.py +75 -84
- indexify/executor/function_executor/function_executor.py +5 -2
- indexify/executor/function_executor/function_executor_state.py +43 -43
- indexify/executor/function_executor/function_executor_states_container.py +10 -4
- indexify/executor/function_executor/function_executor_status.py +91 -0
- indexify/executor/function_executor/health_checker.py +37 -13
- indexify/executor/function_executor/metrics/function_executor.py +1 -1
- indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
- indexify/executor/function_executor/single_task_runner.py +100 -37
- indexify/executor/grpc/channel_creator.py +53 -0
- indexify/executor/grpc/metrics/channel_creator.py +18 -0
- indexify/executor/grpc/metrics/state_reporter.py +17 -0
- indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
- indexify/executor/grpc/state_reporter.py +199 -0
- indexify/executor/metrics/task_runner.py +7 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
- indexify/executor/task_runner.py +34 -6
- indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
- indexify/proto/task_scheduler_pb2.py +64 -0
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/METADATA +1 -1
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/RECORD +28 -24
- indexify/executor/state_reporter.py +0 -127
- indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/WHEEL +0 -0
- {indexify-0.3.15.dist-info → indexify-0.3.17.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
|
4
|
+
class FunctionExecutorStatus(Enum):
    """Lifecycle status of a Function Executor.

    The comment above each member lists the statuses from which it may be
    entered; the authoritative transition table lives in
    is_status_change_allowed().
    """

    # Entered from: DESTROYED.
    STARTING_UP = "Starting Up"
    # Entered from: STARTING_UP.
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
    # Entered from: STARTING_UP.
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
    # Entered from: STARTING_UP, RUNNING_TASK.
    IDLE = "Idle"
    # Entered from: IDLE.
    RUNNING_TASK = "Running Task"
    # Entered from: IDLE, RUNNING_TASK.
    UNHEALTHY = "Unhealthy"
    # Entered from: STARTUP_FAILED_CUSTOMER_ERROR, STARTUP_FAILED_PLATFORM_ERROR,
    # UNHEALTHY, IDLE.
    DESTROYING = "Destroying"
    # Initial status; also entered from DESTROYING.
    DESTROYED = "Destroyed"
    # Terminal status, reachable from any other status; no transitions leave it.
    SHUTDOWN = "Shutdown"
|
34
|
+
|
35
|
+
|
36
|
+
# Transition table: maps each status to the set of statuses it may move to.
# Every status allows a self-transition, and SHUTDOWN is reachable from any
# status but is terminal. Built once at import time as frozensets instead of
# rebuilding a dict of lists on every is_status_change_allowed() call.
_ALLOWED_STATUS_TRANSITIONS = {
    FunctionExecutorStatus.DESTROYED: frozenset(
        {
            FunctionExecutorStatus.DESTROYED,
            FunctionExecutorStatus.STARTING_UP,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTING_UP: frozenset(
        {
            FunctionExecutorStatus.STARTING_UP,
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: frozenset(
        {
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: frozenset(
        {
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.IDLE: frozenset(
        {
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.RUNNING_TASK: frozenset(
        {
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.UNHEALTHY: frozenset(
        {
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    FunctionExecutorStatus.DESTROYING: frozenset(
        {
            FunctionExecutorStatus.DESTROYING,
            FunctionExecutorStatus.DESTROYED,
            FunctionExecutorStatus.SHUTDOWN,
        }
    ),
    # No transitions allowed from SHUTDOWN (other than staying in it).
    FunctionExecutorStatus.SHUTDOWN: frozenset({FunctionExecutorStatus.SHUTDOWN}),
}


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition from current_status to new_status is allowed.

    The allowed transitions are defined by _ALLOWED_STATUS_TRANSITIONS; unknown
    statuses allow no transitions at all.
    """
    return new_status in _ALLOWED_STATUS_TRANSITIONS.get(current_status, frozenset())
|
@@ -1,8 +1,10 @@
|
|
1
1
|
import asyncio
|
2
|
+
import os
|
2
3
|
from collections.abc import Awaitable, Callable
|
3
4
|
from typing import Any, Optional
|
4
5
|
|
5
|
-
|
6
|
+
import grpc
|
7
|
+
import grpc.aio
|
6
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
7
9
|
HealthCheckRequest,
|
8
10
|
HealthCheckResponse,
|
@@ -27,7 +29,10 @@ class HealthCheckResult:
|
|
27
29
|
|
28
30
|
|
29
31
|
class HealthChecker:
|
30
|
-
def __init__(
|
32
|
+
def __init__(
|
33
|
+
self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
|
34
|
+
):
|
35
|
+
self._channel: grpc.aio.Channel = channel
|
31
36
|
self._stub: FunctionExecutorStub = stub
|
32
37
|
self._logger: Any = logger.bind(module=__name__)
|
33
38
|
self._health_check_loop_task: Optional[asyncio.Task] = None
|
@@ -39,6 +44,12 @@ class HealthChecker:
|
|
39
44
|
"""Runs the health check once and returns the result.
|
40
45
|
|
41
46
|
Does not raise any exceptions."""
|
47
|
+
if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
|
48
|
+
return HealthCheckResult(
|
49
|
+
is_healthy=True,
|
50
|
+
reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
|
51
|
+
)
|
52
|
+
|
42
53
|
with metric_health_check_latency.time():
|
43
54
|
try:
|
44
55
|
response: HealthCheckResponse = await self._stub.check_health(
|
@@ -49,19 +60,32 @@ class HealthChecker:
|
|
49
60
|
return HealthCheckResult(
|
50
61
|
is_healthy=response.healthy, reason=response.status_message
|
51
62
|
)
|
52
|
-
except AioRpcError as e:
|
53
|
-
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
63
|
+
except grpc.aio.AioRpcError as e:
|
64
|
+
# Due to the customer code running in Function Executor we can't reliably conclude
|
65
|
+
# that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
|
66
|
+
# hold Python GIL and prevent the health check RPC from being processed by FE Python code.
|
67
|
+
#
|
68
|
+
# The only unhealthy condition we can be sure about is when the channel can't re-establish
|
69
|
+
# the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
|
70
|
+
# code is not involved when TCP connections are established to FE. Problems reestablishing
|
71
|
+
# the TCP connection are usually due to the FE process crashing and its gRPC server socket
|
72
|
+
# not being available anymore or due to prolonged local networking failures on Executor.
|
73
|
+
channel_connectivity = self._channel.get_state()
|
74
|
+
if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
|
75
|
+
return HealthCheckResult(
|
76
|
+
is_healthy=False,
|
77
|
+
reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
|
78
|
+
)
|
79
|
+
else:
|
80
|
+
return HealthCheckResult(
|
81
|
+
is_healthy=True,
|
82
|
+
reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
|
83
|
+
)
|
59
84
|
except Exception as e:
|
60
|
-
|
61
|
-
self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
|
85
|
+
self._logger.error("Got unexpected exception, ignoring", exc_info=e)
|
62
86
|
return HealthCheckResult(
|
63
|
-
is_healthy=
|
64
|
-
reason=f"Unexpected exception in Executor: {str(e)}",
|
87
|
+
is_healthy=True,
|
88
|
+
reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
|
65
89
|
)
|
66
90
|
|
67
91
|
def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
|
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
|
|
90
90
|
)
|
91
91
|
# Counts Function Executor creations, labeled with the executor version and
# the SDK version/language info of the function being executed.
metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
    "function_executor_infos",
    "Number of Function Executor creations with particular info",
    ["version", "sdk_version", "sdk_language", "sdk_language_version"],
)
|
96
96
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
import prometheus_client
|
2
2
|
|
3
|
+
from ..function_executor_status import FunctionExecutorStatus
|
4
|
+
|
3
5
|
# This file contains all metrics used by FunctionExecutorState.
|
4
6
|
|
5
7
|
metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
|
8
10
|
"Number of times a Function Executor state was used without acquiring its lock",
|
9
11
|
)
|
10
12
|
)
|
13
|
+
|
14
|
+
# Function Executors count with a particular status.
metric_function_executors_with_status: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "function_executors_with_status",
        "Number of Function Executors with a particular status",
        ["status"],
    )
)
# Pre-create a labeled child for every status so the gauge exports a 0 value
# for statuses that never occur. Iterating the enum replaces nine hand-written
# .labels() calls and keeps this list in sync if statuses are added or removed.
for _status in FunctionExecutorStatus:
    metric_function_executors_with_status.labels(status=_status.name)
del _status
|
@@ -1,8 +1,10 @@
|
|
1
|
-
from
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from typing import Any, List, Optional
|
2
3
|
|
3
4
|
from .function_executor_server import FunctionExecutorServer
|
4
5
|
|
5
6
|
|
7
|
+
@dataclass
|
6
8
|
class FunctionExecutorServerConfiguration:
|
7
9
|
"""Configuration for creating a FunctionExecutorServer.
|
8
10
|
|
@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
16
|
configuration parameters or raise an exception if it can't implement
|
15
17
|
them."""
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Container image URI of the Function Executor Server.
|
23
|
-
self.image_uri: Optional[str] = image_uri
|
19
|
+
executor_id: str
|
20
|
+
function_executor_id: str
|
21
|
+
namespace: str
|
22
|
+
image_uri: Optional[str]
|
23
|
+
secret_names: List[str]
|
24
24
|
|
25
25
|
|
26
26
|
class FunctionExecutorServerFactory:
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .function_executor_status import FunctionExecutorStatus
|
17
18
|
from .health_checker import HealthChecker, HealthCheckResult
|
18
19
|
from .metrics.single_task_runner import (
|
19
20
|
metric_function_executor_run_task_rpc_errors,
|
@@ -40,9 +41,11 @@ class SingleTaskRunner:
|
|
40
41
|
logger: Any,
|
41
42
|
):
|
42
43
|
self._executor_id: str = executor_id
|
43
|
-
self.
|
44
|
+
self._function_executor_state: FunctionExecutorState = function_executor_state
|
44
45
|
self._task_input: TaskInput = task_input
|
45
|
-
self.
|
46
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
47
|
+
function_executor_server_factory
|
48
|
+
)
|
46
49
|
self._base_url: str = base_url
|
47
50
|
self._config_path: Optional[str] = config_path
|
48
51
|
self._logger = logger.bind(module=__name__)
|
@@ -54,18 +57,32 @@ class SingleTaskRunner:
|
|
54
57
|
The lock is released during actual task run in the server.
|
55
58
|
The lock is relocked on return.
|
56
59
|
|
57
|
-
Raises an exception if an error occured.
|
58
|
-
|
60
|
+
Raises an exception if an error occured.
|
61
|
+
|
62
|
+
On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
63
|
+
On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
64
|
+
"""
|
65
|
+
self._function_executor_state.check_locked()
|
59
66
|
|
60
|
-
if self.
|
61
|
-
|
67
|
+
if self._function_executor_state.status not in [
|
68
|
+
FunctionExecutorStatus.IDLE,
|
69
|
+
FunctionExecutorStatus.UNHEALTHY,
|
70
|
+
FunctionExecutorStatus.DESTROYED,
|
71
|
+
]:
|
72
|
+
self._logger.error(
|
73
|
+
"Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
|
74
|
+
status=self._function_executor_state.status,
|
75
|
+
)
|
76
|
+
raise RuntimeError(
|
77
|
+
f"Unexpected Function Executor state {self._function_executor_state.status}"
|
78
|
+
)
|
62
79
|
|
63
80
|
# If Function Executor became unhealthy while was idle then destroy it.
|
64
81
|
# It'll be recreated below.
|
65
82
|
await self._destroy_existing_function_executor_if_unhealthy()
|
66
83
|
|
67
84
|
# Create Function Executor if it doesn't exist yet.
|
68
|
-
if self.
|
85
|
+
if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
|
69
86
|
try:
|
70
87
|
await self._create_function_executor()
|
71
88
|
except CustomerError as e:
|
@@ -87,15 +104,38 @@ class SingleTaskRunner:
|
|
87
104
|
# The periodic health checker might not notice this as it does only periodic checks.
|
88
105
|
await self._destroy_existing_function_executor_if_unhealthy()
|
89
106
|
|
90
|
-
|
91
|
-
|
92
|
-
|
107
|
+
if self._function_executor_state.status not in [
|
108
|
+
FunctionExecutorStatus.IDLE,
|
109
|
+
FunctionExecutorStatus.UNHEALTHY,
|
110
|
+
FunctionExecutorStatus.DESTROYED,
|
111
|
+
]:
|
112
|
+
self._logger.error(
|
113
|
+
"Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
|
114
|
+
status=self._function_executor_state.status,
|
115
|
+
)
|
116
|
+
if self._function_executor_state.function_executor is None:
|
117
|
+
await self._function_executor_state.set_status(
|
118
|
+
FunctionExecutorStatus.DESTROYED
|
119
|
+
)
|
120
|
+
else:
|
121
|
+
await self._function_executor_state.set_status(
|
122
|
+
FunctionExecutorStatus.UNHEALTHY
|
123
|
+
)
|
124
|
+
|
125
|
+
async def _create_function_executor(self) -> None:
|
126
|
+
await self._function_executor_state.set_status(
|
127
|
+
FunctionExecutorStatus.STARTING_UP
|
128
|
+
)
|
129
|
+
self._function_executor_state.function_executor = FunctionExecutor(
|
130
|
+
server_factory=self._function_executor_server_factory, logger=self._logger
|
93
131
|
)
|
94
132
|
config: FunctionExecutorServerConfiguration = (
|
95
133
|
FunctionExecutorServerConfiguration(
|
96
134
|
executor_id=self._executor_id,
|
97
|
-
function_executor_id=self.
|
135
|
+
function_executor_id=self._function_executor_state.id,
|
136
|
+
namespace=self._task_input.task.namespace,
|
98
137
|
image_uri=self._task_input.task.image_uri,
|
138
|
+
secret_names=self._task_input.task.secret_names or [],
|
99
139
|
)
|
100
140
|
)
|
101
141
|
initialize_request: InitializeRequest = InitializeRequest(
|
@@ -107,17 +147,29 @@ class SingleTaskRunner:
|
|
107
147
|
)
|
108
148
|
|
109
149
|
try:
|
110
|
-
await function_executor.initialize(
|
150
|
+
await self._function_executor_state.function_executor.initialize(
|
111
151
|
config=config,
|
112
152
|
initialize_request=initialize_request,
|
113
153
|
base_url=self._base_url,
|
114
154
|
config_path=self._config_path,
|
115
155
|
)
|
116
|
-
|
156
|
+
except CustomerError:
|
157
|
+
# We have to follow the valid state transition sequence.
|
158
|
+
await self._function_executor_state.set_status(
|
159
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
|
160
|
+
)
|
161
|
+
await self._function_executor_state.destroy_function_executor()
|
162
|
+
raise
|
117
163
|
except Exception:
|
118
|
-
|
164
|
+
# We have to follow the valid state transition sequence.
|
165
|
+
await self._function_executor_state.set_status(
|
166
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
|
167
|
+
)
|
168
|
+
await self._function_executor_state.destroy_function_executor()
|
119
169
|
raise
|
120
170
|
|
171
|
+
await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
|
172
|
+
|
121
173
|
async def _run(self) -> TaskOutput:
|
122
174
|
request: RunTaskRequest = RunTaskRequest(
|
123
175
|
namespace=self._task_input.task.namespace,
|
@@ -130,13 +182,15 @@ class SingleTaskRunner:
|
|
130
182
|
)
|
131
183
|
if self._task_input.init_value is not None:
|
132
184
|
request.function_init_value.CopyFrom(self._task_input.init_value)
|
133
|
-
channel: grpc.aio.Channel =
|
185
|
+
channel: grpc.aio.Channel = (
|
186
|
+
self._function_executor_state.function_executor.channel()
|
187
|
+
)
|
134
188
|
|
135
189
|
async with _RunningTaskContextManager(
|
136
190
|
invocation_id=self._task_input.task.invocation_id,
|
137
191
|
task_id=self._task_input.task.id,
|
138
192
|
health_check_failed_callback=self._health_check_failed_callback,
|
139
|
-
function_executor_state=self.
|
193
|
+
function_executor_state=self._function_executor_state,
|
140
194
|
):
|
141
195
|
with (
|
142
196
|
metric_function_executor_run_task_rpc_errors.count_exceptions(),
|
@@ -154,31 +208,40 @@ class SingleTaskRunner:
|
|
154
208
|
async def _health_check_failed_callback(self, result: HealthCheckResult):
    """Destroys the Function Executor after a failed periodic health check.

    Destroying the FE here ensures that a running task RPC stuck in an
    unhealthy Function Executor fails immediately instead of hanging.
    """
    async with self._function_executor_state.lock:
        state = self._function_executor_state
        if state.status != FunctionExecutorStatus.RUNNING_TASK:
            # The callback can get delivered after the task already finished
            # running; nothing to do in that case.
            return

        await state.set_status(FunctionExecutorStatus.UNHEALTHY)
        await self._destroy_function_executor_on_failed_health_check(result.reason)
|
162
223
|
|
163
224
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
164
|
-
self.
|
165
|
-
if self.
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
225
|
+
self._function_executor_state.check_locked()
|
226
|
+
if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
|
227
|
+
result: HealthCheckResult = (
|
228
|
+
await self._function_executor_state.function_executor.health_checker().check()
|
229
|
+
)
|
230
|
+
if not result.is_healthy:
|
231
|
+
await self._function_executor_state.set_status(
|
232
|
+
FunctionExecutorStatus.UNHEALTHY
|
233
|
+
)
|
234
|
+
|
235
|
+
if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
|
236
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
173
237
|
|
174
238
|
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
    """Logs the health check failure, then destroys the Function Executor.

    Must be called with the Function Executor state lock held.
    """
    state = self._function_executor_state
    state.check_locked()
    self._logger.error(
        "Function Executor health check failed, destroying Function Executor",
        health_check_fail_reason=reason,
    )
    await state.destroy_function_executor()
|
182
245
|
|
183
246
|
|
184
247
|
class _RunningTaskContextManager:
|
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
|
|
199
262
|
self._state: FunctionExecutorState = function_executor_state
|
200
263
|
|
201
264
|
async def __aenter__(self):
|
202
|
-
self._state.
|
265
|
+
await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
|
203
266
|
self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
204
267
|
task_id=self._task_id,
|
205
268
|
invocation_id=self._invocation_id,
|
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:
|
|
213
276
|
|
214
277
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
215
278
|
await self._state.lock.acquire()
|
216
|
-
|
217
|
-
|
218
|
-
|
279
|
+
# Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
|
280
|
+
if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
|
281
|
+
await self._state.set_status(FunctionExecutorStatus.IDLE)
|
219
282
|
self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
|
220
283
|
task_id=self._task_id
|
221
284
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
|
6
|
+
from .metrics.channel_creator import (
|
7
|
+
metric_grpc_server_channel_creation_latency,
|
8
|
+
metric_grpc_server_channel_creation_retries,
|
9
|
+
metric_grpc_server_channel_creations,
|
10
|
+
)
|
11
|
+
|
12
|
+
_RETRY_INTERVAL_SEC = 5
|
13
|
+
_CONNECT_TIMEOUT_SEC = 5
|
14
|
+
|
15
|
+
|
16
|
+
class ChannelCreator:
    """Creates ready gRPC channels to a fixed server address, retrying until a
    connection is established or shutdown() is called."""

    def __init__(self, server_address: str, logger: Any):
        self._logger = logger.bind(module=__name__)
        self._server_address = server_address
        self._is_shutdown = False

    async def create(self) -> grpc.aio.Channel:
        """Creates a channel to the gRPC server.

        Blocks until the channel is ready, retrying every _RETRY_INTERVAL_SEC
        seconds on failure. Never raises any exceptions.

        NOTE(review): if shutdown() is called while this is retrying, the loop
        exits and the coroutine implicitly returns None despite the return
        annotation — callers appear expected to tolerate that; confirm.
        """
        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while not self._is_shutdown:
                try:
                    new_channel = grpc.aio.insecure_channel(self._server_address)
                    await asyncio.wait_for(
                        new_channel.channel_ready(),
                        timeout=_CONNECT_TIMEOUT_SEC,
                    )
                    return new_channel
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    # Best-effort cleanup of the half-open channel; a failure
                    # here must not break the retry loop.
                    try:
                        await new_channel.close()
                    except Exception as close_error:
                        self._logger.error(
                            "failed closing not established channel",
                            exc_info=close_error,
                        )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)

    async def shutdown(self):
        """Makes in-progress and future create() calls stop retrying."""
        self._is_shutdown = True
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Total channel creation attempts (one increment per ChannelCreator.create()).
metric_grpc_server_channel_creations: prometheus_client.Counter = (
    prometheus_client.Counter(
        "grpc_server_channel_creations",
        "Number of times a channel to gRPC Server was created",
    )
)
# Retries performed while establishing a single channel.
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = (
    prometheus_client.Counter(
        "grpc_server_channel_creation_retries",
        "Number of retries during a channel creation to gRPC Server",
    )
)
# Wall-clock latency of establishing a channel, including retries.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "grpc_server_channel_creation",
        "gRPC server channel creation",
    )
)
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# State report RPCs attempted against the Server.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)
# State report RPCs that failed.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)
# Latency of a single state report RPC.
metric_state_report_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "state_report_rpc", "Executor state report rpc to Server"
    )
)
|