PyPI - indexify - Versions diffs - 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl - Mend

indexify 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

indexify/cli/cli.py +19 -2
indexify/executor/executor.py +24 -9
indexify/executor/executor_flavor.py +7 -0
indexify/executor/function_executor/function_executor.py +5 -2
indexify/executor/function_executor/health_checker.py +55 -13
indexify/executor/grpc/channel_manager.py +160 -0
indexify/executor/grpc/state_reconciler.py +14 -9
indexify/executor/grpc/state_reporter.py +72 -14
indexify/executor/metrics/task_runner.py +7 -0
indexify/executor/task_fetcher.py +8 -3
indexify/executor/task_reporter.py +17 -0
indexify/executor/task_runner.py +4 -0
indexify/proto/{task_scheduler.proto → executor_api.proto} +23 -6
indexify/proto/executor_api_pb2.py +70 -0
indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +44 -4
indexify/proto/{task_scheduler_pb2_grpc.py → executor_api_pb2_grpc.py} +36 -26
{indexify-0.3.16.dist-info → indexify-0.3.18.dist-info}/METADATA +1 -1
{indexify-0.3.16.dist-info → indexify-0.3.18.dist-info}/RECORD +21 -20
indexify/executor/grpc/channel_creator.py +0 -53
indexify/proto/task_scheduler_pb2.py +0 -64
/indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
{indexify-0.3.16.dist-info → indexify-0.3.18.dist-info}/WHEEL +0 -0
{indexify-0.3.16.dist-info → indexify-0.3.18.dist-info}/entry_points.txt +0 -0

indexify/cli/cli.py CHANGED Viewed

@@ -13,7 +13,7 @@ import sys
 from importlib.metadata import version
 from pathlib import Path
 from socket import gethostname
-from typing import Annotated, List, Optional, Tuple
+from typing import Annotated, Dict, List, Optional, Tuple
 import nanoid
 import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
 from indexify.executor.api_objects import FunctionURI
 from indexify.executor.executor import Executor
+from indexify.executor.executor_flavor import ExecutorFlavor
 from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
     SubprocessFunctionExecutorServerFactory,
 )
@@ -119,7 +120,6 @@ def executor(
             help="Port where to run Executor Monitoring server",
         ),
     ] = 7000,
-    # TODO: Figure out mTLS for gRPC.
     grpc_server_addr: Annotated[
         Optional[str],
         typer.Option(
@@ -140,6 +140,15 @@ def executor(
             ),
         ),
     ] = False,
+    labels: Annotated[
+        List[str],
+        typer.Option(
+            "--label",
+            "-l",
+            help="Executor key-value label to be sent to the Server. "
+            "Specified as <key>=<value>",
+        ),
+    ] = [],
 ):
     if dev:
         configure_development_mode_logging()
@@ -162,6 +171,11 @@ def executor(
             "--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
         )
+    kv_labels: Dict[str, str] = {}
+    for label in labels:
+        key, value = label.split("=")
+        kv_labels[key] = value
     executor_version = version("indexify")
     logger = structlog.get_logger(module=__name__, executor_id=executor_id)
@@ -171,6 +185,7 @@ def executor(
         server_addr=server_addr,
         config_path=config_path,
         executor_version=executor_version,
+        labels=kv_labels,
         executor_cache=executor_cache,
         ports=ports,
         functions=function_uris,
@@ -205,7 +220,9 @@ def executor(
     Executor(
         id=executor_id,
         development_mode=dev,
+        flavor=ExecutorFlavor.OSS,
         version=executor_version,
+        labels=kv_labels,
         health_checker=GenericHealthChecker(),
         code_path=executor_cache,
         function_allowlist=_parse_function_uris(function_uris),

indexify/executor/executor.py CHANGED Viewed

@@ -9,17 +9,18 @@ import structlog
 from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
 from tensorlake.utils.logging import suppress as suppress_logging
-from indexify.proto.task_scheduler_pb2 import ExecutorStatus
+from indexify.proto.executor_api_pb2 import ExecutorStatus
 from .api_objects import FunctionURI, Task
 from .downloader import Downloader
+from .executor_flavor import ExecutorFlavor
 from .function_executor.function_executor_states_container import (
     FunctionExecutorStatesContainer,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_creator import ChannelCreator
+from .grpc.channel_manager import ChannelManager
 from .grpc.state_reconciler import ExecutorStateReconciler
 from .grpc.state_reporter import ExecutorStateReporter
 from .metrics.executor import (
@@ -55,7 +56,9 @@ class Executor:
         self,
         id: str,
         development_mode: bool,
+        flavor: ExecutorFlavor,
         version: str,
+        labels: Dict[str, str],
         code_path: Path,
         health_checker: HealthChecker,
         function_allowlist: Optional[List[FunctionURI]],
@@ -106,18 +109,25 @@ class Executor:
         self._task_runner: Optional[TaskRunner] = None
         self._task_fetcher: Optional[TaskFetcher] = None
         # gRPC mode services
-        self._channel_creator: Optional[ChannelCreator] = None
+        self._channel_manager: Optional[ChannelManager] = None
         self._state_reporter: Optional[ExecutorStateReporter] = None
         self._state_reconciler: Optional[ExecutorStateReconciler] = None
         if grpc_server_addr is not None:
-            self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
+            self._channel_manager = ChannelManager(
+                server_address=grpc_server_addr,
+                config_path=config_path,
+                logger=self._logger,
+            )
             self._state_reporter = ExecutorStateReporter(
                 executor_id=id,
+                flavor=flavor,
+                version=version,
+                labels=labels,
                 development_mode=development_mode,
                 function_allowlist=self._function_allowlist,
                 function_executor_states=self._function_executor_states,
-                channel_creator=self._channel_creator,
+                channel_manager=self._channel_manager,
                 logger=self._logger,
             )
             self._state_reporter.update_executor_status(
@@ -133,7 +143,8 @@ class Executor:
                 config_path=config_path,
                 downloader=self._downloader,
                 task_reporter=self._task_reporter,
-                channel_creator=self._channel_creator,
+                channel_manager=self._channel_manager,
+                state_reporter=self._state_reporter,
                 logger=self._logger,
             )
         else:
@@ -147,6 +158,7 @@ class Executor:
             self._task_fetcher = TaskFetcher(
                 executor_id=id,
                 executor_version=version,
+                labels=labels,
                 function_allowlist=function_allowlist,
                 protocol=protocol,
                 indexify_server_addr=self._server_addr,
@@ -326,7 +338,9 @@ class Executor:
             ).inc()
     async def _shutdown(self, loop):
-        self._logger.info("shutting down")
+        self._logger.info(
+            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
+        )
         if self._state_reporter is not None:
             self._state_reporter.update_executor_status(
                 ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +353,13 @@ class Executor:
         self._is_shutdown = True
         await self._monitoring_server.shutdown()
+        await self._task_reporter.shutdown()
         if self._task_runner is not None:
             await self._task_runner.shutdown()
-        if self._channel_creator is not None:
-            await self._channel_creator.shutdown()
+        if self._channel_manager is not None:
+            await self._channel_manager.shutdown()
         if self._state_reporter is not None:
             await self._state_reporter.shutdown()
         if self._state_reconciler is not None:

indexify/executor/executor_flavor.py ADDED Viewed

@@ -0,0 +1,7 @@
+from enum import Enum
+class ExecutorFlavor(Enum):
+    UNKNOWN = "unknown"
+    OSS = "oss"
+    PLATFORM = "platform"

indexify/executor/function_executor/function_executor.py CHANGED Viewed

@@ -110,7 +110,7 @@ class FunctionExecutor:
                     config_path=config_path,
                     initialize_request=initialize_request,
                 )
-                await self._create_health_checker(stub)
+                await self._create_health_checker(self._channel, stub)
                 self._initialized = True
         except Exception:
             await self.destroy()
@@ -243,12 +243,15 @@ class FunctionExecutor:
         finally:
             self._invocation_state_client = None
-    async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
+    async def _create_health_checker(
+        self, channel: grpc.aio.Channel, stub: FunctionExecutorStub
+    ) -> None:
         with (
             metric_create_health_checker_errors.count_exceptions(),
             metric_create_health_checker_latency.time(),
         ):
             self._health_checker = HealthChecker(
+                channel=channel,
                 stub=stub,
                 logger=self._logger,
             )

indexify/executor/function_executor/health_checker.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import asyncio
+import os
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
-from grpc.aio import AioRpcError
+import grpc
+import grpc.aio
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     HealthCheckRequest,
     HealthCheckResponse,
@@ -27,7 +29,10 @@ class HealthCheckResult:
 class HealthChecker:
-    def __init__(self, stub: FunctionExecutorStub, logger: Any):
+    def __init__(
+        self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
+    ):
+        self._channel: grpc.aio.Channel = channel
         self._stub: FunctionExecutorStub = stub
         self._logger: Any = logger.bind(module=__name__)
         self._health_check_loop_task: Optional[asyncio.Task] = None
@@ -39,6 +44,12 @@ class HealthChecker:
         """Runs the health check once and returns the result.
         Does not raise any exceptions."""
+        if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
+            return HealthCheckResult(
+                is_healthy=True,
+                reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
+            )
         with metric_health_check_latency.time():
             try:
                 response: HealthCheckResponse = await self._stub.check_health(
@@ -49,19 +60,34 @@ class HealthChecker:
                 return HealthCheckResult(
                     is_healthy=response.healthy, reason=response.status_message
                 )
-            except AioRpcError as e:
-                metric_failed_health_checks.inc()
-                # Expected exception when there are problems with communication because e.g. the server is unhealthy.
-                return HealthCheckResult(
-                    is_healthy=False,
-                    reason=f"Executor side RPC channel error: {str(e)}",
-                )
+            except grpc.aio.AioRpcError as e:
+                # Due to the customer code running in Function Executor we can't reliably conclude
+                # that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
+                # hold Python GIL and prevent the health check RPC from being processed by FE Python code.
+                #
+                # The only unhealthy condition we can be sure about is when the channel can't re-establish
+                # the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
+                # code is not involved when TCP connections are established to FE. Problems reestablishing
+                # the TCP connection are usually due to the FE process crashing and its gRPC server socket
+                # not being available anymore or due to prolonged local networking failures on Executor.
+                if (
+                    _channel_state(self._channel, self._logger)
+                    == grpc.ChannelConnectivity.TRANSIENT_FAILURE
+                ):
+                    return HealthCheckResult(
+                        is_healthy=False,
+                        reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
+                    )
+                else:
+                    return HealthCheckResult(
+                        is_healthy=True,
+                        reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
+                    )
             except Exception as e:
-                metric_failed_health_checks.inc()
-                self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
+                self._logger.error("Got unexpected exception, ignoring", exc_info=e)
                 return HealthCheckResult(
-                    is_healthy=False,
-                    reason=f"Unexpected exception in Executor: {str(e)}",
+                    is_healthy=True,
+                    reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
                 )
     def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
@@ -102,3 +128,19 @@ class HealthChecker:
         asyncio.create_task(self._health_check_failed_callback(result))
         self._health_check_loop_task = None
+def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
+    """Get channel connectivity state and suppresses all exceptions.
+    Suppressing the exceptions is important because the channel connectivity state is an experimental
+    feature. On error fallse back to READY state which assumes that the channel is okay.
+    """
+    try:
+        return channel.get_state()
+    except Exception as e:
+        logger.error(
+            "Failed getting channel state, falling back to default READY state",
+            exc_info=e,
+        )
+        return grpc.ChannelConnectivity.READY

indexify/executor/grpc/channel_manager.py ADDED Viewed

@@ -0,0 +1,160 @@
+import asyncio
+from typing import Any, Dict, Optional
+import grpc.aio
+import yaml
+from .metrics.channel_manager import (
+    metric_grpc_server_channel_creation_latency,
+    metric_grpc_server_channel_creation_retries,
+    metric_grpc_server_channel_creations,
+)
+_RETRY_INTERVAL_SEC = 5
+_CONNECT_TIMEOUT_SEC = 5
+class ChannelManager:
+    def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
+        self._logger: Any = logger.bind(module=__name__, server_address=server_address)
+        self._server_address: str = server_address
+        self._channel_credentials: Optional[grpc.ChannelCredentials] = None
+        # This lock protects the fields below.
+        self._lock = asyncio.Lock()
+        self._channel: Optional[grpc.aio.Channel] = None
+        self._init_tls(config_path)
+    def _init_tls(self, config_path: Optional[str]):
+        if config_path is None:
+            return
+        # The same config file format as in Tensorlake SDK HTTP client, see:
+        # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
+        with open(config_path, "r") as config_file:
+            config = yaml.safe_load(config_file)
+        if not config.get("use_tls", False):
+            return
+        tls_config: Dict[str, str] = config["tls_config"]
+        cert_path: Optional[str] = tls_config.get("cert_path", None)
+        key_path: Optional[str] = tls_config.get("key_path", None)
+        ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
+        self._logger = self._logger.bind(
+            cert_path=cert_path,
+            key_path=key_path,
+            ca_bundle_path=ca_bundle_path,
+        )
+        self._logger.info("TLS is enabled for grpc channels to server")
+        private_key: Optional[bytes] = None
+        certificate_chain: Optional[bytes] = None
+        root_certificates: Optional[bytes] = None
+        if cert_path is not None:
+            with open(cert_path, "rb") as cert_file:
+                certificate_chain = cert_file.read()
+        if key_path is not None:
+            with open(key_path, "rb") as key_file:
+                private_key = key_file.read()
+        if ca_bundle_path is not None:
+            with open(ca_bundle_path, "rb") as ca_bundle_file:
+                root_certificates = ca_bundle_file.read()
+        self._channel_credentials = grpc.ssl_channel_credentials(
+            root_certificates=root_certificates,
+            private_key=private_key,
+            certificate_chain=certificate_chain,
+        )
+    async def get_channel(self) -> grpc.aio.Channel:
+        """Returns a channel to the gRPC server.
+        Returns a ready to use channel. Blocks until the channel is ready,
+        never raises any exceptions.
+        If previously returned channel is healthy then returns it again.
+        Otherwise, returns a new channel but closes the previously returned one.
+        """
+        # Use the lock to ensure that we only create one channel without race conditions.
+        async with self._lock:
+            if self._channel is None:
+                self._channel = await self._create_channel()
+            elif not await self._locked_channel_is_healthy():
+                self._logger.info("grpc channel to server is unhealthy")
+                await self._destroy_locked_channel()
+                self._channel = await self._create_channel()
+            return self._channel
+    async def _create_channel(self) -> grpc.aio.Channel:
+        """Creates a new channel to the gRPC server."
+        Returns a ready to use channel. Blocks until the channel
+        is ready, never raises any exceptions.
+        """
+        self._logger.info("creating new grpc server channel")
+        with metric_grpc_server_channel_creation_latency.time():
+            metric_grpc_server_channel_creations.inc()
+            while True:
+                try:
+                    if self._channel_credentials is None:
+                        channel = grpc.aio.insecure_channel(target=self._server_address)
+                    else:
+                        channel = grpc.aio.secure_channel(
+                            target=self._server_address,
+                            credentials=self._channel_credentials,
+                        )
+                    await asyncio.wait_for(
+                        channel.channel_ready(),
+                        timeout=_CONNECT_TIMEOUT_SEC,
+                    )
+                    return channel
+                except Exception:
+                    self._logger.error(
+                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
+                    )
+                    try:
+                        await channel.close()
+                    except Exception as e:
+                        self._logger.error(
+                            "failed closing not established channel", exc_info=e
+                        )
+                    metric_grpc_server_channel_creation_retries.inc()
+                    await asyncio.sleep(_RETRY_INTERVAL_SEC)
+    async def _locked_channel_is_healthy(self) -> bool:
+        """Checks if the channel is healthy.
+        Returns True if the channel is healthy, False otherwise.
+        self._lock must be acquired before calling this method.
+        Never raises any exceptions.
+        """
+        try:
+            return self._channel.get_state() == grpc.ChannelConnectivity.READY
+        except Exception as e:
+            # Assume that the channel is healthy because get_state() method is marked as experimental
+            # so we can't fully trust it.
+            self._logger.error(
+                "failed getting channel state, assuming channel is healthy", exc_info=e
+            )
+            return True
+    async def _destroy_locked_channel(self):
+        """Closes the existing channel.
+        self._lock must be acquired before calling this method.
+        Never raises any exceptions.
+        """
+        try:
+            await self._channel.close()
+        except Exception as e:
+            self._logger.error("failed closing channel", exc_info=e)
+        self._channel = None
+    async def shutdown(self):
+        pass

indexify/executor/grpc/state_reconciler.py CHANGED Viewed

@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
     SerializedObject,
 )
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     DesiredExecutorState,
     FunctionExecutorDescription,
     FunctionExecutorStatus,
     GetDesiredExecutorStatesRequest,
 )
-from indexify.proto.task_scheduler_pb2_grpc import (
-    TaskSchedulerServiceStub,
+from indexify.proto.executor_api_pb2_grpc import (
+    ExecutorAPIStub,
 )
 from ..downloader import Downloader
@@ -43,7 +43,8 @@ from ..metrics.executor import (
     metric_tasks_reporting_outcome,
 )
 from ..task_reporter import TaskReporter
-from .channel_creator import ChannelCreator
+from .channel_manager import ChannelManager
+from .state_reporter import ExecutorStateReporter
 _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
@@ -58,7 +59,8 @@ class ExecutorStateReconciler:
         config_path: Optional[str],
         downloader: Downloader,
         task_reporter: TaskReporter,
-        channel_creator: ChannelCreator,
+        channel_manager: ChannelManager,
+        state_reporter: ExecutorStateReporter,
         logger: Any,
     ):
         self._executor_id: str = executor_id
@@ -72,7 +74,8 @@ class ExecutorStateReconciler:
         self._function_executor_states: FunctionExecutorStatesContainer = (
             function_executor_states
         )
-        self._channel_creator = channel_creator
+        self._channel_manager: ChannelManager = channel_manager
+        self._state_reporter: ExecutorStateReporter = state_reporter
         self._logger: Any = logger.bind(module=__name__)
         self._is_shutdown: bool = False
         self._server_last_clock: Optional[int] = None
@@ -83,12 +86,14 @@ class ExecutorStateReconciler:
         Never raises any exceptions.
         """
         while not self._is_shutdown:
-            async with await self._channel_creator.create() as server_channel:
+            async with await self._channel_manager.get_channel() as server_channel:
                 server_channel: grpc.aio.Channel
-                stub = TaskSchedulerServiceStub(server_channel)
+                stub = ExecutorAPIStub(server_channel)
                 while not self._is_shutdown:
                     try:
-                        # TODO: Report state once before starting the stream.
+                        # Report state once before starting the stream so Server
+                        # doesn't use old state it knew about this Executor in the past.
+                        await self._state_reporter.report_state(stub)
                         desired_states_stream: AsyncGenerator[
                             DesiredExecutorState, None
                         ] = stub.get_desired_executor_states(

indexify 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl

indexify 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl