PyPI - indexify - Versions diffs - 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl - Mend

indexify 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

indexify/cli/cli.py +11 -7
indexify/executor/downloader.py +99 -50
indexify/executor/executor.py +149 -28
indexify/executor/function_executor/function_executor.py +28 -1
indexify/executor/function_executor/function_executor_state.py +23 -4
indexify/executor/function_executor/function_executor_states_container.py +28 -16
indexify/executor/function_executor/health_checker.py +26 -11
indexify/executor/function_executor/metrics/function_executor.py +16 -0
indexify/executor/function_executor/server/function_executor_server_factory.py +4 -1
indexify/executor/function_executor/single_task_runner.py +28 -8
indexify/executor/function_executor/task_output.py +27 -4
indexify/executor/state_reconciler.py +288 -0
indexify/executor/state_reporter.py +127 -0
indexify/executor/task_reporter.py +6 -6
indexify/executor/task_runner.py +20 -12
indexify/task_scheduler/proto/task_scheduler.proto +147 -0
indexify/task_scheduler/proto/task_scheduler_pb2.py +69 -0
indexify/task_scheduler/proto/task_scheduler_pb2.pyi +286 -0
indexify/task_scheduler/proto/task_scheduler_pb2_grpc.py +170 -0
{indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/METADATA +1 -1
{indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/RECORD +23 -17
{indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/WHEEL +0 -0
{indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/entry_points.txt +0 -0

indexify/executor/function_executor/function_executor_states_container.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import asyncio
-from typing import AsyncGenerator, Dict
+from typing import AsyncGenerator, Dict, Optional
-from ..api_objects import Task
 from .function_executor_state import FunctionExecutorState
 from .metrics.function_executor_state_container import (
     metric_function_executor_states_count,
@@ -17,19 +16,33 @@ class FunctionExecutorStatesContainer:
         self._states: Dict[str, FunctionExecutorState] = {}
         self._is_shutdown: bool = False
-    async def get_or_create_state(self, task: Task) -> FunctionExecutorState:
-        """Get or create a function executor state for the given task.
+    async def get_or_create_state(
+        self,
+        id: str,
+        namespace: str,
+        graph_name: str,
+        graph_version: str,
+        function_name: str,
+        image_uri: Optional[str],
+    ) -> FunctionExecutorState:
+        """Get or create a function executor state with the given ID.
+        If the state already exists, it is returned. Otherwise, a new state is created from the supplied task.
         Raises Exception if it's not possible to create a new state at this time."""
         async with self._lock:
             if self._is_shutdown:
-                raise RuntimeError("Task runner is shutting down.")
+                raise RuntimeError(
+                    "Function Executor states container is shutting down."
+                )
-            id = function_id_without_version(task)
             if id not in self._states:
                 state = FunctionExecutorState(
-                    function_id_with_version=function_id_with_version(task),
-                    function_id_without_version=id,
+                    id=id,
+                    namespace=namespace,
+                    graph_name=graph_name,
+                    graph_version=graph_version,
+                    function_name=function_name,
+                    image_uri=image_uri,
                 )
                 self._states[id] = state
                 metric_function_executor_states_count.set(len(self._states))
@@ -41,6 +54,13 @@ class FunctionExecutorStatesContainer:
             for state in self._states.values():
                 yield state
+    async def pop(self, id: str) -> FunctionExecutorState:
+        """Removes the state with the given ID and returns it."""
+        async with self._lock:
+            state = self._states.pop(id)
+            metric_function_executor_states_count.set(len(self._states))
+            return state
     async def shutdown(self):
         # Function Executors are outside the Executor process
         # so they need to get cleaned up explicitly and reliably.
@@ -54,11 +74,3 @@ class FunctionExecutorStatesContainer:
                 async with state.lock:
                     await state.shutdown()
                     # The task running inside the Function Executor will fail because it's destroyed.
-def function_id_with_version(task: Task) -> str:
-    return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
-def function_id_without_version(task: Task) -> str:
-    return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"

indexify/executor/function_executor/health_checker.py CHANGED Viewed

@@ -20,16 +20,22 @@ from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
 HEALTH_CHECK_POLL_PERIOD_SEC = 10
+class HealthCheckResult:
+    def __init__(self, is_healthy: bool, reason: str):
+        self.is_healthy: bool = is_healthy
+        self.reason: str = reason
 class HealthChecker:
     def __init__(self, stub: FunctionExecutorStub, logger: Any):
         self._stub: FunctionExecutorStub = stub
         self._logger: Any = logger.bind(module=__name__)
         self._health_check_loop_task: Optional[asyncio.Task] = None
-        self._health_check_failed_callback: Optional[Callable[[], Awaitable[None]]] = (
-            None
-        )
+        self._health_check_failed_callback: Optional[
+            Callable[[HealthCheckResult], Awaitable[None]]
+        ] = None
-    async def check(self) -> bool:
+    async def check(self) -> HealthCheckResult:
         """Runs the health check once and returns the result.
         Does not raise any exceptions."""
@@ -40,17 +46,25 @@ class HealthChecker:
                 )
                 if not response.healthy:
                     metric_failed_health_checks.inc()
-                return response.healthy
-            except AioRpcError:
+                return HealthCheckResult(
+                    is_healthy=response.healthy, reason=response.status_message
+                )
+            except AioRpcError as e:
                 metric_failed_health_checks.inc()
                 # Expected exception when there are problems with communication because e.g. the server is unhealthy.
-                return False
+                return HealthCheckResult(
+                    is_healthy=False,
+                    reason=f"Executor side RPC channel error: {str(e)}",
+                )
             except Exception as e:
                 metric_failed_health_checks.inc()
                 self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
-                return False
+                return HealthCheckResult(
+                    is_healthy=False,
+                    reason=f"Unexpected exception in Executor: {str(e)}",
+                )
-    def start(self, callback: Callable[[], Awaitable[None]]) -> None:
+    def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
         """Starts periodic health checks.
         The supplied callback is an async function called in the calling thread's
@@ -81,9 +95,10 @@ class HealthChecker:
     async def _health_check_loop(self) -> None:
         while True:
-            if not await self.check():
+            result: HealthCheckResult = await self.check()
+            if not result.is_healthy:
                 break
             await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
-        asyncio.create_task(self._health_check_failed_callback())
+        asyncio.create_task(self._health_check_failed_callback(result))
         self._health_check_loop_task = None

indexify/executor/function_executor/metrics/function_executor.py CHANGED Viewed

@@ -78,6 +78,22 @@ metric_destroy_channel_errors: prometheus_client.Counter = prometheus_client.Cou
     "Number of Function Executor channel destruction errors",
 )
+# FE get_info RPC metrics.
+metric_get_info_rpc_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_get_info_rpc", "Function Executor get_info RPC"
+    )
+)
+metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_get_info_rpc_errors",
+    "Number of Function Executor get_info RPC errors",
+)
+metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_infos",
+    "Number of Function Executors with particular info",
+    ["version", "sdk_version", "sdk_language", "sdk_language_version"],
+)
 # FE initialization RPC metrics.
 metric_initialize_rpc_latency: prometheus_client.Histogram = (
     latency_metric_for_customer_controlled_operation(

indexify/executor/function_executor/server/function_executor_server_factory.py CHANGED Viewed

@@ -14,8 +14,11 @@ class FunctionExecutorServerConfiguration:
     configuration parameters or raise an exception if it can't implement
     them."""
-    def __init__(self, executor_id: str, image_uri: Optional[str]):
+    def __init__(
+        self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
+    ):
         self.executor_id: str = executor_id
+        self.function_executor_id: str = function_executor_id
         # Container image URI of the Function Executor Server.
         self.image_uri: Optional[str] = image_uri

indexify/executor/function_executor/single_task_runner.py CHANGED Viewed

@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 from ..api_objects import Task
 from .function_executor import CustomerError, FunctionExecutor
 from .function_executor_state import FunctionExecutorState
+from .health_checker import HealthChecker, HealthCheckResult
 from .metrics.single_task_runner import (
     metric_function_executor_run_task_rpc_errors,
     metric_function_executor_run_task_rpc_latency,
@@ -69,7 +70,12 @@ class SingleTaskRunner:
                 await self._create_function_executor()
             except CustomerError as e:
                 return TaskOutput(
-                    task=self._task_input.task,
+                    task_id=self._task_input.task.id,
+                    namespace=self._task_input.task.namespace,
+                    graph_name=self._task_input.task.compute_graph,
+                    function_name=self._task_input.task.compute_fn,
+                    graph_version=self._task_input.task.graph_version,
+                    graph_invocation_id=self._task_input.task.invocation_id,
                     stderr=str(e),
                     success=False,
                 )
@@ -88,6 +94,7 @@ class SingleTaskRunner:
         config: FunctionExecutorServerConfiguration = (
             FunctionExecutorServerConfiguration(
                 executor_id=self._executor_id,
+                function_executor_id=self._state.id,
                 image_uri=self._task_input.task.image_uri,
             )
         )
@@ -144,24 +151,32 @@ class SingleTaskRunner:
                 ).run_task(request)
             return _task_output(task=self._task_input.task, response=response)
-    async def _health_check_failed_callback(self):
+    async def _health_check_failed_callback(self, result: HealthCheckResult):
         # Function Executor destroy due to the periodic health check failure ensures that
         # a running task RPC stuck in unhealthy Function Executor fails immidiately.
         async with self._state.lock:
             if self._state.function_executor is not None:
-                await self._destroy_function_executor_on_failed_health_check()
+                await self._destroy_function_executor_on_failed_health_check(
+                    result.reason
+                )
     async def _destroy_existing_function_executor_if_unhealthy(self):
         self._state.check_locked()
         if self._state.function_executor is None:
             return
-        if await self._state.function_executor.health_checker().check():
+        result: HealthCheckResult = (
+            await self._state.function_executor.health_checker().check()
+        )
+        if result.is_healthy:
             return
-        await self._destroy_function_executor_on_failed_health_check()
+        await self._destroy_function_executor_on_failed_health_check(result.reason)
-    async def _destroy_function_executor_on_failed_health_check(self):
+    async def _destroy_function_executor_on_failed_health_check(self, reason: str):
         self._state.check_locked()
-        self._logger.error("Health check failed, destroying FunctionExecutor.")
+        self._logger.error(
+            "Function Executor health check failed, destroying Function Executor",
+            health_check_fail_reason=reason,
+        )
         self._state.health_check_failed = True
         await self._state.destroy_function_executor()
@@ -220,7 +235,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
             raise ValueError(f"Response is missing required field: {field}")
     output = TaskOutput(
-        task=task,
+        task_id=task.id,
+        namespace=task.namespace,
+        graph_name=task.compute_graph,
+        function_name=task.compute_fn,
+        graph_version=task.graph_version,
+        graph_invocation_id=task.invocation_id,
         stdout=response.stdout,
         stderr=response.stderr,
         reducer=response.is_reducer,

indexify/executor/function_executor/task_output.py CHANGED Viewed

@@ -13,7 +13,12 @@ class TaskOutput:
     def __init__(
         self,
-        task: Task,
+        task_id: str,
+        namespace: str,
+        graph_name: str,
+        function_name: str,
+        graph_version: str,
+        graph_invocation_id: str,
         function_output: Optional[FunctionOutput] = None,
         router_output: Optional[RouterOutput] = None,
         stdout: Optional[str] = None,
@@ -22,7 +27,12 @@ class TaskOutput:
         success: bool = False,
         is_internal_error: bool = False,
     ):
-        self.task = task
+        self.task_id = task_id
+        self.namespace = namespace
+        self.graph_name = graph_name
+        self.function_name = function_name
+        self.graph_version = graph_version
+        self.graph_invocation_id = graph_invocation_id
         self.function_output = function_output
         self.router_output = router_output
         self.stdout = stdout
@@ -32,11 +42,24 @@ class TaskOutput:
         self.is_internal_error = is_internal_error
     @classmethod
-    def internal_error(cls, task: Task) -> "TaskOutput":
+    def internal_error(
+        cls,
+        task_id: str,
+        namespace: str,
+        graph_name: str,
+        function_name: str,
+        graph_version: str,
+        graph_invocation_id: str,
+    ) -> "TaskOutput":
         """Creates a TaskOutput for an internal error."""
         # We are not sharing internal error messages with the customer.
         return TaskOutput(
-            task=task,
+            task_id=task_id,
+            namespace=namespace,
+            graph_name=graph_name,
+            function_name=function_name,
+            graph_version=graph_version,
+            graph_invocation_id=graph_invocation_id,
             stderr="Platform failed to execute the function.",
             is_internal_error=True,
         )

indexify/executor/state_reconciler.py ADDED Viewed

@@ -0,0 +1,288 @@
+import asyncio
+from typing import Any, AsyncGenerator, List, Optional, Set
+import grpc
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    InitializeRequest,
+    SerializedObject,
+)
+from indexify.task_scheduler.proto.task_scheduler_pb2 import (
+    DesiredExecutorState,
+    FunctionExecutorDescription,
+    FunctionExecutorStatus,
+    GetDesiredExecutorStatesRequest,
+)
+from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
+    TaskSchedulerServiceStub,
+)
+from .downloader import Downloader
+from .function_executor.function_executor import CustomerError, FunctionExecutor
+from .function_executor.function_executor_state import FunctionExecutorState
+from .function_executor.function_executor_states_container import (
+    FunctionExecutorStatesContainer,
+)
+from .function_executor.server.function_executor_server_factory import (
+    FunctionExecutorServerConfiguration,
+    FunctionExecutorServerFactory,
+)
+from .function_executor.task_input import TaskInput
+from .function_executor.task_output import TaskOutput
+from .metrics.executor import (
+    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
+    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
+    metric_task_completion_latency,
+    metric_task_outcome_report_latency,
+    metric_task_outcome_report_retries,
+    metric_task_outcome_reports,
+    metric_tasks_completed,
+    metric_tasks_fetched,
+    metric_tasks_reporting_outcome,
+)
+from .task_reporter import TaskReporter
+class ExecutorStateReconciler:
+    def __init__(
+        self,
+        executor_id: str,
+        function_executor_server_factory: FunctionExecutorServerFactory,
+        base_url: str,
+        function_executor_states: FunctionExecutorStatesContainer,
+        config_path: Optional[str],
+        downloader: Downloader,
+        task_reporter: TaskReporter,
+        server_channel: grpc.aio.Channel,
+        logger: Any,
+    ):
+        self._executor_id: str = executor_id
+        self._factory: FunctionExecutorServerFactory = function_executor_server_factory
+        self._base_url: str = base_url
+        self._config_path: Optional[str] = config_path
+        self._downloader: Downloader = downloader
+        self._task_reporter: TaskReporter = task_reporter
+        self._function_executor_states: FunctionExecutorStatesContainer = (
+            function_executor_states
+        )
+        self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
+        self._logger: Any = logger.bind(module=__name__)
+        self._is_shutdown: bool = False
+        self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
+        self._server_last_clock: Optional[int] = None
+    async def run(self):
+        desired_states: AsyncGenerator[DesiredExecutorState, None] = (
+            self._stub.get_desired_executor_states(
+                GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
+            )
+        )
+        async for new_state in desired_states:
+            if self._is_shutdown:
+                return
+            new_state: DesiredExecutorState
+            if self._server_last_clock is not None:
+                if self._server_last_clock >= new_state.clock:
+                    continue  # Duplicate or outdated message state sent by Server.
+            self._server_last_clock = new_state.clock
+            asyncio.create_task(self._reconcile_state(new_state))
+    async def _reconcile_state(self, new_state: DesiredExecutorState):
+        if self._is_shutdown:
+            return
+        # Simple non concurrent implementation for now for the PoC.
+        # Obtain this lock to force only a single coroutine doing the reconciliation.
+        async with self._reconciliation_lock:
+            await self._reconcile_function_executors(new_state)
+            # TODO
+            # await self._reconcile_task_allocations(new_state)
+    async def shutdown(self):
+        """Shuts down the state reconciler.
+        Never raises any exceptions.
+        """
+        self._is_shutdown = True
+    async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
+        desired_function_executor_ids: Set[str] = set()
+        for desired_function_executor in desired_state.function_executors:
+            desired_function_executor: FunctionExecutorDescription
+            desired_function_executor_ids.add(desired_function_executor.id)
+            function_executor_state: FunctionExecutorState = (
+                self._function_executor_states.get_or_create_state(
+                    id=desired_function_executor.id,
+                    namespace=desired_function_executor.namespace,
+                    graph_name=desired_function_executor.graph_name,
+                    graph_version=desired_function_executor.graph_version,
+                    function_name=desired_function_executor.function_name,
+                )
+            )
+            async with function_executor_state.lock:
+                if (
+                    function_executor_state.status
+                    == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
+                ):
+                    function_executor_state.status = (
+                        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
+                    )
+                    try:
+                        function_executor_state.function_executor = (
+                            await self._create_function_executor()
+                        )
+                        function_executor_state.status = (
+                            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
+                        )
+                    except CustomerError as e:
+                        function_executor_state.status = (
+                            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
+                        )
+                    except Exception as e:
+                        function_executor_state.status = (
+                            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
+                        )
+                        self._logger.error(
+                            f"Failed to create Function Executor", exc_info=e
+                        )
+        function_executor_state_ids_to_destroy: List[str] = []
+        async for function_executor_state in self._function_executor_states:
+            function_executor_state: FunctionExecutorState
+            if function_executor_state.id not in desired_function_executor_ids:
+                function_executor_state_ids_to_destroy.append(
+                    function_executor_state.id
+                )
+        for function_executor_state_id in function_executor_state_ids_to_destroy:
+            function_executor_state: FunctionExecutorState = (
+                self._function_executor_states.pop_state(function_executor_state_id)
+            )
+            async with function_executor_state.lock:
+                logger = self._function_executor_logger(
+                    id=function_executor_state.id,
+                    namespace=function_executor_state.namespace,
+                    graph_name=function_executor_state.graph_name,
+                    graph_version=function_executor_state.graph_version,
+                    function_name=function_executor_state.function_name,
+                )
+                if (
+                    function_executor_state.status
+                    == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
+                ):
+                    logger.warning(
+                        "Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
+                    )
+                function_executor_state.status = (
+                    FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
+                )
+                await function_executor_state.destroy_function_executor()
+                function_executor_state.status = (
+                    FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
+                )
+    async def _create_function_executor(
+        self, description: FunctionExecutorDescription
+    ) -> FunctionExecutor:
+        logger = self._function_executor_logger(
+            id=description.id,
+            namespace=description.namespace,
+            graph_name=description.graph_name,
+            graph_version=description.graph_version,
+            function_name=description.function_name,
+        )
+        graph: SerializedObject = await self._downloader.download_graph(
+            namespace=description.namespace,
+            graph_name=description.graph_name,
+            graph_version=description.graph_version,
+            logger=logger,
+        )
+        function_executor: FunctionExecutor = FunctionExecutor(
+            server_factory=self._factory, logger=logger
+        )
+        config: FunctionExecutorServerConfiguration = (
+            FunctionExecutorServerConfiguration(
+                executor_id=self._executor_id,
+                function_executor_id=description.id,
+                image_uri=description.image_uri,
+            )
+        )
+        initialize_request: InitializeRequest = InitializeRequest(
+            namespace=description.namespace,
+            graph_name=description.graph_name,
+            graph_version=description.graph_version,
+            function_name=description.function_name,
+            graph=graph,
+        )
+        try:
+            await function_executor.initialize(
+                config=config,
+                initialize_request=initialize_request,
+                base_url=self._base_url,
+                config_path=self._config_path,
+            )
+            return function_executor
+        except Exception:
+            await function_executor.destroy()
+            raise
+    async def _cancel_running_tasks(
+        self, function_executor_state: FunctionExecutorState
+    ):
+        pass
+    def _function_executor_logger(
+        self,
+        id: str,
+        namespace: str,
+        graph_name: str,
+        graph_version: str,
+        function_name: str,
+    ) -> Any:
+        return self._logger.bind(
+            id=id,
+            namespace=namespace,
+            graph=graph_name,
+            graph_version=graph_version,
+            function_name=function_name,
+        )
+    async def _report_task_outcome(self, task_output: TaskOutput):
+        """Reports the task with the given output to the server.
+        Doesn't raise any Exceptions. Runs till the reporting is successful."""
+        reporting_retries: int = 0
+        while True:
+            logger = logger.bind(retries=reporting_retries)
+            try:
+                await self._task_reporter.report(output=task_output, logger=logger)
+                break
+            except Exception as e:
+                logger.error(
+                    "failed to report task",
+                    exc_info=e,
+                )
+                reporting_retries += 1
+                metric_task_outcome_report_retries.inc()
+                await asyncio.sleep(5)
+        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
+        if task_output.is_internal_error:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
+            ).inc()
+        elif task_output.success:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
+            ).inc()
+        else:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+            ).inc()

indexify 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

indexify 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl