PyPI - indexify - Versions diffs - 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl - Mend

indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

indexify/cli/cli.py +12 -0
indexify/executor/api_objects.py +11 -6
indexify/executor/blob_store/blob_store.py +69 -0
indexify/executor/blob_store/local_fs_blob_store.py +48 -0
indexify/executor/blob_store/metrics/blob_store.py +33 -0
indexify/executor/blob_store/s3_blob_store.py +88 -0
indexify/executor/downloader.py +192 -27
indexify/executor/executor.py +29 -13
indexify/executor/function_executor/function_executor.py +1 -1
indexify/executor/function_executor/function_executor_states_container.py +5 -0
indexify/executor/function_executor/function_executor_status.py +2 -0
indexify/executor/function_executor/health_checker.py +7 -2
indexify/executor/function_executor/invocation_state_client.py +4 -2
indexify/executor/function_executor/single_task_runner.py +2 -0
indexify/executor/function_executor/task_output.py +8 -1
indexify/executor/grpc/channel_manager.py +4 -3
indexify/executor/grpc/function_executor_controller.py +163 -193
indexify/executor/grpc/metrics/state_reconciler.py +17 -0
indexify/executor/grpc/metrics/task_controller.py +8 -0
indexify/executor/grpc/state_reconciler.py +305 -188
indexify/executor/grpc/state_reporter.py +18 -10
indexify/executor/grpc/task_controller.py +247 -189
indexify/executor/metrics/task_reporter.py +17 -0
indexify/executor/task_reporter.py +217 -94
indexify/executor/task_runner.py +1 -0
indexify/proto/executor_api.proto +37 -11
indexify/proto/executor_api_pb2.py +49 -47
indexify/proto/executor_api_pb2.pyi +55 -15
{indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
{indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
indexify/executor/grpc/completed_tasks_container.py +0 -26
{indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
{indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0

indexify/executor/grpc/function_executor_controller.py CHANGED Viewed

@@ -25,6 +25,60 @@ from ..function_executor.server.function_executor_server_factory import (
 )
+def validate_function_executor_description(
+    function_executor_description: FunctionExecutorDescription,
+) -> None:
+    """Validates the supplied FE description.
+    Raises ValueError if the description is not valid.
+    """
+    validator = MessageValidator(function_executor_description)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    # TODO: Make graph required after we migrate to direct S3 downloads.
+    # image_uri is optional.
+    # secret_names can be empty.
+    # resource_limits is optional.
+def function_executor_logger(
+    function_executor_description: FunctionExecutorDescription, logger: Any
+) -> Any:
+    """Returns a logger bound with the FE's metadata.
+    The function assumes that the FE might be invalid."""
+    return logger.bind(
+        function_executor_id=(
+            function_executor_description.id
+            if function_executor_description.HasField("id")
+            else None
+        ),
+        namespace=(
+            function_executor_description.namespace
+            if function_executor_description.HasField("namespace")
+            else None
+        ),
+        graph_name=(
+            function_executor_description.graph_name
+            if function_executor_description.HasField("graph_name")
+            else None
+        ),
+        graph_version=(
+            function_executor_description.graph_version
+            if function_executor_description.HasField("graph_version")
+            else None
+        ),
+        function_name=(
+            function_executor_description.function_name
+            if function_executor_description.HasField("function_name")
+            else None
+        ),
+    )
 class FunctionExecutorController:
     def __init__(
         self,
@@ -39,9 +93,9 @@ class FunctionExecutorController:
     ):
         """Initializes the FunctionExecutorController.
-        Raises ValueError if the supplied FunctionExecutorDescription is not valid.
+        The supplied FunctionExecutorDescription must be already validated by the caller
+        using validate_function_executor_description().
         """
-        _validate_function_executor_description(function_executor_description)
         self._executor_id: str = executor_id
         self._function_executor_state: FunctionExecutorState = function_executor_state
         self._function_executor_description: FunctionExecutorDescription = (
@@ -53,17 +107,10 @@ class FunctionExecutorController:
         self._downloader: Downloader = downloader
         self._base_url: str = base_url
         self._config_path: str = config_path
-        self._logger: Any = logger.bind(
+        self._logger: Any = function_executor_logger(
+            function_executor_description, logger
+        ).bind(
             module=__name__,
-            function_executor_id=function_executor_description.id,
-            namespace=function_executor_description.namespace,
-            graph_name=function_executor_description.graph_name,
-            graph_version=function_executor_description.graph_version,
-            function_name=function_executor_description.function_name,
-            image_uri=function_executor_description.image_uri,
-        )
-        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
-            self._reconciliation_loop()
         )
         # The locks protects the desired status.
         self._lock: asyncio.Lock = asyncio.Lock()
@@ -74,13 +121,31 @@ class FunctionExecutorController:
         self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
             lock=self._lock
         )
+        # Automatically start the controller on creation.
+        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
+            self._reconciliation_loop(),
+            name="function executor controller reconciliation loop",
+        )
+    def function_executor_description(self) -> FunctionExecutorDescription:
+        return self._function_executor_description
+    async def startup(self) -> None:
+        await self._set_desired_status(
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
+        )
-    async def set_desired_status(
+    async def shutdown(self) -> None:
+        await self._set_desired_status(
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+        )
+    async def _set_desired_status(
         self, desired_status: FunctionExecutorStatusProto
     ) -> None:
         """Updates the desired Function Executor status.
-        Reconciliation is done asynchronously.
+        Reconciliation is done asynchronously. Doesn't raise any exceptions.
         """
         async with self._lock:
             if self._desired_status == desired_status:
@@ -105,146 +170,73 @@ class FunctionExecutorController:
             await self._reconcile(last_seen_desired_status)
     async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
-        async with self._function_executor_state.lock:
-            current_status: FunctionExecutorStatus = (
-                self._function_executor_state.status
-            )
-            # We have to process all possible combination of current and desired statuses.
-            if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
-                ):
-                    return  # Same status, nothing to do.
-                # All we can do from the current status is to destroy the FE to possibly recreate it later
-                # if Server requests to do this. This is why we don't accept any other desired statuses.
-                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
-            if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
-                ):
-                    return  # Same status, nothing to do.
-                # All we can do from the current status is to destroy the FE to possibly recreate it later
-                # if Server requests to do this. This is why we don't accept any other desired statuses.
-                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
-            if current_status == FunctionExecutorStatus.IDLE:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
-                ):
-                    return  # Same status, nothing to do.
-                # Server can only request FE destroy or shutdown when FE has IDLE status.
-                # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
-                # Transition from IDLE to UNHEALTHY can only be done by FE controller.
-                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
-            if current_status == FunctionExecutorStatus.RUNNING_TASK:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
-                ):
-                    return  # Same status, nothing to do.
+        """Reconciles the FE status with the desired status.
-                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
-                # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
-                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
-            if current_status == FunctionExecutorStatus.UNHEALTHY:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
-                ):
-                    return  # Same status, nothing to do.
-                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
-                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
-            if current_status == FunctionExecutorStatus.DESTROYED:
-                if (
-                    desired_status
-                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
-                ):
-                    return  # Same status, nothing to do.
-                return await self._reconcile_from_destroyed(desired_status)
-            # _reconcile() can't be called when current FE status is one of "long running" states
-            # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
-            # is called with concurrency of 1 and _reconcile() waits until these long running states
-            # (operations) are finished before returning.
-            #
-            # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
-            # status, it cancels the reconciliation loop task.
-            self._logger.error(
-                "unexpected current function executor status, skipping state reconciliation",
-                current_status=current_status.name,
-                desired_status=FunctionExecutorStatusProto.Name(desired_status),
-            )
+        Doesn't raise any exceptions."""
+        async with self._function_executor_state.lock:
+            if (
+                desired_status
+                == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
+            ):
+                return await self._startup()
+            elif (
+                desired_status
+                == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+            ):
+                # Shutdown can be requested with any current status.
+                return await self._shutdown()
+            else:
+                self._logger.error(
+                    "unexpected desired function executor status received from server, skipping state reconciliation",
+                    current_status=self._function_executor_state.status.name,
+                    desired_status=FunctionExecutorStatusProto.Name(desired_status),
+                )
-    async def _destroy_or_shutdown_fe_if_desired(
-        self, desired_status: FunctionExecutorStatusProto
-    ) -> None:
-        """Destroys the Function Executor if desired status asks for it.
+    async def _shutdown(self) -> None:
+        """Shutsdown the Function Executor and frees all of its resources.
-        Otherwise logs an error because other actions are not allowed by the current status.
-        Caller holds the FE state lock.
+        Caller holds the FE state lock. Doesn't raise any exceptions.
         """
-        if desired_status not in [
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
+        # Run destroy sequence if current FE status requires it (see allows FE status transitions).
+        # We won't see DESTROYING and STARTING_UP statuses here because FE reconciliation is done
+        # with concurrency of 1.
+        if self._function_executor_state.status in [
+            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
+            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
+            FunctionExecutorStatus.IDLE,
+            FunctionExecutorStatus.RUNNING_TASK,
+            FunctionExecutorStatus.UNHEALTHY,
         ]:
-            self._logger.error(
-                "unexpected desired function executor status received from server, skipping state reconciliation",
-                current_status=self._function_executor_state.status.name,
-                desired_status=FunctionExecutorStatusProto.Name(desired_status),
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.DESTROYING
             )
-            return
+            if self._function_executor_state.function_executor is not None:
+                async with _UnlockedLockContextManager(
+                    self._function_executor_state.lock
+                ):
+                    await self._function_executor_state.function_executor.destroy()
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.DESTROYED
+            )
+            self._function_executor_state.function_executor = None
-        await self._destroy_function_executor()
-        # FE state status is now DESTROYED.
-        if (
-            desired_status
-            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
-        ):
-            await self._shutdown()
-            # No code is executed after this point because reconciliation loop aio task is cancelled.
+        self._logger.info("shutting down function executor controller")
+        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
+        self._reconciliation_loop_task.cancel()
+        # No code is executed after this point because reconciliation loop aio task is cancelled.
-    async def _reconcile_from_destroyed(
-        self, desired_status: FunctionExecutorStatusProto
-    ) -> None:
-        """Reconciles the FE state when it has DESTROYED status.
+    async def _startup(self) -> None:
+        """Startups the FE if possible.
-        Caller holds the FE state lock.
+        Caller holds the FE state lock. Doesn't raise any exceptions.
         """
-        if desired_status not in [
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
-            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
-        ]:
+        if self._function_executor_state.status != FunctionExecutorStatus.DESTROYED:
             self._logger.error(
-                "unexpected desired function executor status received from server, skipping state reconciliation",
+                "Can't startup Function Executor from its current state, skipping startup",
                 current_status=self._function_executor_state.status.name,
-                desired_status=FunctionExecutorStatusProto.Name(desired_status),
             )
             return
-        if (
-            desired_status
-            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
-        ):
-            await self._shutdown()
-            # No code is executed after this point because reconciliation loop aio task is cancelled.
-            return
-        # All the rest of the allowed desired statuses ask to create the FE.
         await self._function_executor_state.set_status(
             FunctionExecutorStatus.STARTING_UP
         )
@@ -267,6 +259,7 @@ class FunctionExecutorController:
                 next_status_message = str(e)
             except Exception as e:
                 next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
+                self._logger.error("failed to create function executor", exc_info=e)
         # FE state lock is acquired again at this point.
         await self._function_executor_state.set_status(next_status, next_status_message)
@@ -279,47 +272,35 @@ class FunctionExecutorController:
                 self._health_check_failed_callback
             )
-    async def _destroy_function_executor(self) -> None:
-        """Destroys the Function Executor if it exists.
-        Caller holds the FE state lock.
-        """
-        await self._function_executor_state.set_status(
-            FunctionExecutorStatus.DESTROYING
-        )
-        async with _UnlockedLockContextManager(self._function_executor_state.lock):
-            await self._function_executor_state.function_executor.destroy()
-        await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
-        self._function_executor_state.function_executor = None
-    async def _shutdown(self) -> None:
-        """Shuts down the controller.
-        Caller holds the FE state lock.
-        Raises asyncio.CancelledError on return when called from reconciliation loop.
-        """
-        self._logger.info("shutting down function executor controller")
-        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
-        self._reconciliation_loop_task.cancel()
-        await self._reconciliation_loop_task
     async def _health_check_failed_callback(self, result: HealthCheckResult):
         async with self._function_executor_state.lock:
             if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
                 return
-            if self._function_executor_state.status in (
+            # There can be false positive health check failures when we're creating
+            # or destroying FEs so we only react to health check failures when we expect
+            # the FE to be healthy.
+            if self._function_executor_state.status not in (
                 FunctionExecutorStatus.IDLE,
                 FunctionExecutorStatus.RUNNING_TASK,
             ):
-                # There can be false positive health check failures when we're creating
-                # or destroying FEs so we're not interested in them.
-                #
-                # Server should react to this transition into unhealthy state and ask to
-                # destroy this FE.
-                await self._function_executor_state.set_status(
-                    FunctionExecutorStatus.UNHEALTHY
-                )
+                return
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.UNHEALTHY
+            )
+            function_executor: FunctionExecutor = (
+                self._function_executor_state.function_executor
+            )
+            self._function_executor_state.function_executor = None
+        self._logger.error(
+            "Function Executor health check failed, destroying Function Executor",
+            health_check_fail_reason=result.reason,
+        )
+        # Destroy the unhealthy FE asap so it doesn't consume resources.
+        # Do it with unlocked state lock to not stop other work on this FE state.
+        await function_executor.destroy()
 async def _create_function_executor(
@@ -341,12 +322,18 @@ async def _create_function_executor(
         graph_name=function_executor_description.graph_name,
         graph_version=function_executor_description.graph_version,
         logger=logger,
+        data_payload=(
+            function_executor_description.graph
+            if function_executor_description.HasField("graph")
+            else None
+        ),
     )
     config: FunctionExecutorServerConfiguration = FunctionExecutorServerConfiguration(
         executor_id=executor_id,
         function_executor_id=function_executor_description.id,
         namespace=function_executor_description.namespace,
+        image_uri=None,
         secret_names=list(function_executor_description.secret_names),
     )
     if function_executor_description.HasField("image_uri"):
@@ -361,8 +348,6 @@ async def _create_function_executor(
     )
     customer_code_timeout_sec: Optional[float] = None
     if function_executor_description.HasField("customer_code_timeout_ms"):
-        # TODO: Add integration tests with FE customer code initialization timeout
-        # when end-to-end implementation is done.
         customer_code_timeout_sec = (
             function_executor_description.customer_code_timeout_ms / 1000.0
         )
@@ -381,29 +366,14 @@ async def _create_function_executor(
             customer_code_timeout_sec=customer_code_timeout_sec,
         )
         return function_executor
-    except Exception:
+    except (Exception, asyncio.CancelledError):
+        # Destroy the failed to startup FE asap so it doesn't consume resources.
+        # Destroy the FE also if the FE initialization got cancelled to not leak
+        # allocated resources.
         await function_executor.destroy()
         raise
-def _validate_function_executor_description(
-    function_executor_description: FunctionExecutorDescription,
-) -> None:
-    """Validates the supplied FE description.
-    Raises ValueError if the description is not valid.
-    """
-    validator = MessageValidator(function_executor_description)
-    validator.required_field("id")
-    validator.required_field("namespace")
-    validator.required_field("graph_name")
-    validator.required_field("graph_version")
-    validator.required_field("function_name")
-    # image_uri is optional.
-    # secret_names can be empty.
-    # resource_limits is optional.
 class _UnlockedLockContextManager:
     """Unlocks its lock on enter to the scope and locks it back on exit."""

indexify/executor/grpc/metrics/state_reconciler.py ADDED Viewed

@@ -0,0 +1,17 @@
+import prometheus_client
+from ...monitoring.metrics import latency_metric_for_fast_operation
+metric_state_reconciliations = prometheus_client.Counter(
+    "state_reconciliations",
+    "Number of Executor state reconciliations",
+)
+metric_state_reconciliation_errors = prometheus_client.Counter(
+    "state_reconciliation_errors",
+    "Number of Executor state reconciliation errors after all retries",
+)
+metric_state_reconciliation_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "state_reconciliation", "Executor state reconciliation"
+    )
+)

indexify/executor/grpc/metrics/task_controller.py ADDED Viewed

@@ -0,0 +1,8 @@
+import prometheus_client
+from ...monitoring.metrics import latency_metric_for_fast_operation
+metric_task_cancellations = prometheus_client.Counter(
+    "task_cancellations",
+    "Number of times a task was cancelled",
+)

indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl

indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl