indexify 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +20 -91
- indexify/executor/api_objects.py +2 -0
- indexify/executor/executor.py +77 -86
- indexify/executor/function_executor/function_executor_state.py +43 -43
- indexify/executor/function_executor/function_executor_states_container.py +10 -4
- indexify/executor/function_executor/function_executor_status.py +91 -0
- indexify/executor/function_executor/metrics/function_executor.py +1 -1
- indexify/executor/function_executor/metrics/function_executor_state.py +36 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +8 -8
- indexify/executor/function_executor/single_task_runner.py +100 -37
- indexify/executor/grpc/channel_creator.py +53 -0
- indexify/executor/grpc/metrics/channel_creator.py +18 -0
- indexify/executor/grpc/metrics/state_reporter.py +17 -0
- indexify/executor/{state_reconciler.py → grpc/state_reconciler.py} +60 -31
- indexify/executor/grpc/state_reporter.py +199 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +27 -12
- indexify/executor/task_runner.py +30 -6
- indexify/{task_scheduler/proto → proto}/task_scheduler.proto +23 -17
- indexify/proto/task_scheduler_pb2.py +64 -0
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2.pyi +28 -10
- indexify/{task_scheduler/proto → proto}/task_scheduler_pb2_grpc.py +16 -16
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/METADATA +1 -1
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/RECORD +25 -21
- indexify/executor/state_reporter.py +0 -127
- indexify/task_scheduler/proto/task_scheduler_pb2.py +0 -69
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/WHEEL +0 -0
- {indexify-0.3.14.dist-info → indexify-0.3.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
|
4
|
+
class FunctionExecutorStatus(Enum):
    """Lifecycle status of a Function Executor.

    The comment above each member lists the statuses it can be reached from;
    `is_status_change_allowed` encodes the same state machine.
    """

    # Reachable from: DESTROYED.
    STARTING_UP = "Starting Up"
    # Reachable from: STARTING_UP.
    STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
    # Reachable from: STARTING_UP.
    STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
    # Reachable from: STARTING_UP, RUNNING_TASK.
    IDLE = "Idle"
    # Reachable from: IDLE.
    RUNNING_TASK = "Running Task"
    # Reachable from: IDLE, RUNNING_TASK.
    UNHEALTHY = "Unhealthy"
    # Reachable from: STARTUP_FAILED_CUSTOMER_ERROR, STARTUP_FAILED_PLATFORM_ERROR,
    # UNHEALTHY, IDLE.
    DESTROYING = "Destroying"
    # Initial status; also reachable from DESTROYING.
    DESTROYED = "Destroyed"
    # Permanent stop state, reachable from any status.
    SHUTDOWN = "Shutdown"


def is_status_change_allowed(
    current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
) -> bool:
    """Returns True if the transition is allowed."""
    # Every status may transition to itself (no-op), and the terminal
    # SHUTDOWN status is reachable from anywhere.
    if new_status is current_status or new_status is FunctionExecutorStatus.SHUTDOWN:
        return True

    # Remaining allowed transitions, keyed by the current status.
    successors = {
        FunctionExecutorStatus.DESTROYED: {FunctionExecutorStatus.STARTING_UP},
        FunctionExecutorStatus.STARTING_UP: {
            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
            FunctionExecutorStatus.IDLE,
        },
        FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: {
            FunctionExecutorStatus.DESTROYING
        },
        FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: {
            FunctionExecutorStatus.DESTROYING
        },
        FunctionExecutorStatus.IDLE: {
            FunctionExecutorStatus.RUNNING_TASK,
            FunctionExecutorStatus.UNHEALTHY,
            FunctionExecutorStatus.DESTROYING,
        },
        FunctionExecutorStatus.RUNNING_TASK: {
            FunctionExecutorStatus.IDLE,
            FunctionExecutorStatus.UNHEALTHY,
        },
        FunctionExecutorStatus.UNHEALTHY: {FunctionExecutorStatus.DESTROYING},
        FunctionExecutorStatus.DESTROYING: {FunctionExecutorStatus.DESTROYED},
        FunctionExecutorStatus.SHUTDOWN: set(),  # No transitions out of SHUTDOWN.
    }
    return new_status in successors.get(current_status, set())
|
@@ -90,7 +90,7 @@ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counte
|
|
90
90
|
)
|
91
91
|
metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
|
92
92
|
"function_executor_infos",
|
93
|
-
"Number of Function
|
93
|
+
"Number of Function Executor creations with particular info",
|
94
94
|
["version", "sdk_version", "sdk_language", "sdk_language_version"],
|
95
95
|
)
|
96
96
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
import prometheus_client
|
2
2
|
|
3
|
+
from ..function_executor_status import FunctionExecutorStatus
|
4
|
+
|
3
5
|
# This file contains all metrics used by FunctionExecutorState.
|
4
6
|
|
5
7
|
metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
@@ -8,3 +10,37 @@ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
|
|
8
10
|
"Number of times a Function Executor state was used without acquiring its lock",
|
9
11
|
)
|
10
12
|
)
|
13
|
+
|
14
|
+
# Function Executors count with a particular status.
|
15
|
+
metric_function_executors_with_status: prometheus_client.Gauge = (
|
16
|
+
prometheus_client.Gauge(
|
17
|
+
"function_executors_with_status",
|
18
|
+
"Number of Function Executors with a particular status",
|
19
|
+
["status"],
|
20
|
+
)
|
21
|
+
)
|
22
|
+
metric_function_executors_with_status.labels(
|
23
|
+
status=FunctionExecutorStatus.STARTING_UP.name
|
24
|
+
)
|
25
|
+
metric_function_executors_with_status.labels(
|
26
|
+
status=FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR.name
|
27
|
+
)
|
28
|
+
metric_function_executors_with_status.labels(
|
29
|
+
status=FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR.name
|
30
|
+
)
|
31
|
+
metric_function_executors_with_status.labels(status=FunctionExecutorStatus.IDLE.name)
|
32
|
+
metric_function_executors_with_status.labels(
|
33
|
+
status=FunctionExecutorStatus.RUNNING_TASK.name
|
34
|
+
)
|
35
|
+
metric_function_executors_with_status.labels(
|
36
|
+
status=FunctionExecutorStatus.UNHEALTHY.name
|
37
|
+
)
|
38
|
+
metric_function_executors_with_status.labels(
|
39
|
+
status=FunctionExecutorStatus.DESTROYING.name
|
40
|
+
)
|
41
|
+
metric_function_executors_with_status.labels(
|
42
|
+
status=FunctionExecutorStatus.DESTROYED.name
|
43
|
+
)
|
44
|
+
metric_function_executors_with_status.labels(
|
45
|
+
status=FunctionExecutorStatus.SHUTDOWN.name
|
46
|
+
)
|
@@ -1,8 +1,10 @@
|
|
1
|
-
from
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from typing import Any, List, Optional
|
2
3
|
|
3
4
|
from .function_executor_server import FunctionExecutorServer
|
4
5
|
|
5
6
|
|
7
|
+
@dataclass
|
6
8
|
class FunctionExecutorServerConfiguration:
|
7
9
|
"""Configuration for creating a FunctionExecutorServer.
|
8
10
|
|
@@ -14,13 +16,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
16
|
configuration parameters or raise an exception if it can't implement
|
15
17
|
them."""
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Container image URI of the Function Executor Server.
|
23
|
-
self.image_uri: Optional[str] = image_uri
|
19
|
+
executor_id: str
|
20
|
+
function_executor_id: str
|
21
|
+
namespace: str
|
22
|
+
image_uri: Optional[str]
|
23
|
+
secret_names: List[str]
|
24
24
|
|
25
25
|
|
26
26
|
class FunctionExecutorServerFactory:
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .function_executor_status import FunctionExecutorStatus
|
17
18
|
from .health_checker import HealthChecker, HealthCheckResult
|
18
19
|
from .metrics.single_task_runner import (
|
19
20
|
metric_function_executor_run_task_rpc_errors,
|
@@ -40,9 +41,11 @@ class SingleTaskRunner:
|
|
40
41
|
logger: Any,
|
41
42
|
):
|
42
43
|
self._executor_id: str = executor_id
|
43
|
-
self.
|
44
|
+
self._function_executor_state: FunctionExecutorState = function_executor_state
|
44
45
|
self._task_input: TaskInput = task_input
|
45
|
-
self.
|
46
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
47
|
+
function_executor_server_factory
|
48
|
+
)
|
46
49
|
self._base_url: str = base_url
|
47
50
|
self._config_path: Optional[str] = config_path
|
48
51
|
self._logger = logger.bind(module=__name__)
|
@@ -54,18 +57,32 @@ class SingleTaskRunner:
|
|
54
57
|
The lock is released during actual task run in the server.
|
55
58
|
The lock is relocked on return.
|
56
59
|
|
57
|
-
Raises an exception if an error occured.
|
58
|
-
|
60
|
+
Raises an exception if an error occured.
|
61
|
+
|
62
|
+
On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
63
|
+
On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
|
64
|
+
"""
|
65
|
+
self._function_executor_state.check_locked()
|
59
66
|
|
60
|
-
if self.
|
61
|
-
|
67
|
+
if self._function_executor_state.status not in [
|
68
|
+
FunctionExecutorStatus.IDLE,
|
69
|
+
FunctionExecutorStatus.UNHEALTHY,
|
70
|
+
FunctionExecutorStatus.DESTROYED,
|
71
|
+
]:
|
72
|
+
self._logger.error(
|
73
|
+
"Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
|
74
|
+
status=self._function_executor_state.status,
|
75
|
+
)
|
76
|
+
raise RuntimeError(
|
77
|
+
f"Unexpected Function Executor state {self._function_executor_state.status}"
|
78
|
+
)
|
62
79
|
|
63
80
|
# If Function Executor became unhealthy while was idle then destroy it.
|
64
81
|
# It'll be recreated below.
|
65
82
|
await self._destroy_existing_function_executor_if_unhealthy()
|
66
83
|
|
67
84
|
# Create Function Executor if it doesn't exist yet.
|
68
|
-
if self.
|
85
|
+
if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
|
69
86
|
try:
|
70
87
|
await self._create_function_executor()
|
71
88
|
except CustomerError as e:
|
@@ -87,15 +104,38 @@ class SingleTaskRunner:
|
|
87
104
|
# The periodic health checker might not notice this as it does only periodic checks.
|
88
105
|
await self._destroy_existing_function_executor_if_unhealthy()
|
89
106
|
|
90
|
-
|
91
|
-
|
92
|
-
|
107
|
+
if self._function_executor_state.status not in [
|
108
|
+
FunctionExecutorStatus.IDLE,
|
109
|
+
FunctionExecutorStatus.UNHEALTHY,
|
110
|
+
FunctionExecutorStatus.DESTROYED,
|
111
|
+
]:
|
112
|
+
self._logger.error(
|
113
|
+
"Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
|
114
|
+
status=self._function_executor_state.status,
|
115
|
+
)
|
116
|
+
if self._function_executor_state.function_executor is None:
|
117
|
+
await self._function_executor_state.set_status(
|
118
|
+
FunctionExecutorStatus.DESTROYED
|
119
|
+
)
|
120
|
+
else:
|
121
|
+
await self._function_executor_state.set_status(
|
122
|
+
FunctionExecutorStatus.UNHEALTHY
|
123
|
+
)
|
124
|
+
|
125
|
+
async def _create_function_executor(self) -> None:
|
126
|
+
await self._function_executor_state.set_status(
|
127
|
+
FunctionExecutorStatus.STARTING_UP
|
128
|
+
)
|
129
|
+
self._function_executor_state.function_executor = FunctionExecutor(
|
130
|
+
server_factory=self._function_executor_server_factory, logger=self._logger
|
93
131
|
)
|
94
132
|
config: FunctionExecutorServerConfiguration = (
|
95
133
|
FunctionExecutorServerConfiguration(
|
96
134
|
executor_id=self._executor_id,
|
97
|
-
function_executor_id=self.
|
135
|
+
function_executor_id=self._function_executor_state.id,
|
136
|
+
namespace=self._task_input.task.namespace,
|
98
137
|
image_uri=self._task_input.task.image_uri,
|
138
|
+
secret_names=self._task_input.task.secret_names or [],
|
99
139
|
)
|
100
140
|
)
|
101
141
|
initialize_request: InitializeRequest = InitializeRequest(
|
@@ -107,17 +147,29 @@ class SingleTaskRunner:
|
|
107
147
|
)
|
108
148
|
|
109
149
|
try:
|
110
|
-
await function_executor.initialize(
|
150
|
+
await self._function_executor_state.function_executor.initialize(
|
111
151
|
config=config,
|
112
152
|
initialize_request=initialize_request,
|
113
153
|
base_url=self._base_url,
|
114
154
|
config_path=self._config_path,
|
115
155
|
)
|
116
|
-
|
156
|
+
except CustomerError:
|
157
|
+
# We have to follow the valid state transition sequence.
|
158
|
+
await self._function_executor_state.set_status(
|
159
|
+
FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
|
160
|
+
)
|
161
|
+
await self._function_executor_state.destroy_function_executor()
|
162
|
+
raise
|
117
163
|
except Exception:
|
118
|
-
|
164
|
+
# We have to follow the valid state transition sequence.
|
165
|
+
await self._function_executor_state.set_status(
|
166
|
+
FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
|
167
|
+
)
|
168
|
+
await self._function_executor_state.destroy_function_executor()
|
119
169
|
raise
|
120
170
|
|
171
|
+
await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
|
172
|
+
|
121
173
|
async def _run(self) -> TaskOutput:
|
122
174
|
request: RunTaskRequest = RunTaskRequest(
|
123
175
|
namespace=self._task_input.task.namespace,
|
@@ -130,13 +182,15 @@ class SingleTaskRunner:
|
|
130
182
|
)
|
131
183
|
if self._task_input.init_value is not None:
|
132
184
|
request.function_init_value.CopyFrom(self._task_input.init_value)
|
133
|
-
channel: grpc.aio.Channel =
|
185
|
+
channel: grpc.aio.Channel = (
|
186
|
+
self._function_executor_state.function_executor.channel()
|
187
|
+
)
|
134
188
|
|
135
189
|
async with _RunningTaskContextManager(
|
136
190
|
invocation_id=self._task_input.task.invocation_id,
|
137
191
|
task_id=self._task_input.task.id,
|
138
192
|
health_check_failed_callback=self._health_check_failed_callback,
|
139
|
-
function_executor_state=self.
|
193
|
+
function_executor_state=self._function_executor_state,
|
140
194
|
):
|
141
195
|
with (
|
142
196
|
metric_function_executor_run_task_rpc_errors.count_exceptions(),
|
@@ -154,31 +208,40 @@ class SingleTaskRunner:
|
|
154
208
|
async def _health_check_failed_callback(self, result: HealthCheckResult):
|
155
209
|
# Function Executor destroy due to the periodic health check failure ensures that
|
156
210
|
# a running task RPC stuck in unhealthy Function Executor fails immidiately.
|
157
|
-
async with self.
|
158
|
-
if
|
159
|
-
|
160
|
-
|
161
|
-
|
211
|
+
async with self._function_executor_state.lock:
|
212
|
+
if (
|
213
|
+
self._function_executor_state.status
|
214
|
+
!= FunctionExecutorStatus.RUNNING_TASK
|
215
|
+
):
|
216
|
+
# Protection in case the callback gets delivered after we finished running the task.
|
217
|
+
return
|
218
|
+
|
219
|
+
await self._function_executor_state.set_status(
|
220
|
+
FunctionExecutorStatus.UNHEALTHY
|
221
|
+
)
|
222
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
162
223
|
|
163
224
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
164
|
-
self.
|
165
|
-
if self.
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
225
|
+
self._function_executor_state.check_locked()
|
226
|
+
if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
|
227
|
+
result: HealthCheckResult = (
|
228
|
+
await self._function_executor_state.function_executor.health_checker().check()
|
229
|
+
)
|
230
|
+
if not result.is_healthy:
|
231
|
+
await self._function_executor_state.set_status(
|
232
|
+
FunctionExecutorStatus.UNHEALTHY
|
233
|
+
)
|
234
|
+
|
235
|
+
if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
|
236
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
173
237
|
|
174
238
|
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
|
175
|
-
self.
|
239
|
+
self._function_executor_state.check_locked()
|
176
240
|
self._logger.error(
|
177
241
|
"Function Executor health check failed, destroying Function Executor",
|
178
242
|
health_check_fail_reason=reason,
|
179
243
|
)
|
180
|
-
self.
|
181
|
-
await self._state.destroy_function_executor()
|
244
|
+
await self._function_executor_state.destroy_function_executor()
|
182
245
|
|
183
246
|
|
184
247
|
class _RunningTaskContextManager:
|
@@ -199,7 +262,7 @@ class _RunningTaskContextManager:
|
|
199
262
|
self._state: FunctionExecutorState = function_executor_state
|
200
263
|
|
201
264
|
async def __aenter__(self):
|
202
|
-
self._state.
|
265
|
+
await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
|
203
266
|
self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
204
267
|
task_id=self._task_id,
|
205
268
|
invocation_id=self._invocation_id,
|
@@ -213,9 +276,9 @@ class _RunningTaskContextManager:
|
|
213
276
|
|
214
277
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
215
278
|
await self._state.lock.acquire()
|
216
|
-
|
217
|
-
|
218
|
-
|
279
|
+
# Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
|
280
|
+
if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
|
281
|
+
await self._state.set_status(FunctionExecutorStatus.IDLE)
|
219
282
|
self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
|
220
283
|
task_id=self._task_id
|
221
284
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
import grpc.aio
|
5
|
+
|
6
|
+
from .metrics.channel_creator import (
|
7
|
+
metric_grpc_server_channel_creation_latency,
|
8
|
+
metric_grpc_server_channel_creation_retries,
|
9
|
+
metric_grpc_server_channel_creations,
|
10
|
+
)
|
11
|
+
|
12
|
+
_RETRY_INTERVAL_SEC = 5
|
13
|
+
_CONNECT_TIMEOUT_SEC = 5
|
14
|
+
|
15
|
+
|
16
|
+
class ChannelCreator:
    """Creates ready-to-use channels to the gRPC Server, retrying until success or shutdown."""

    def __init__(self, server_address: str, logger: Any):
        self._server_address = server_address
        self._is_shutdown = False
        self._logger = logger.bind(module=__name__)

    async def create(self) -> grpc.aio.Channel:
        """Creates a channel to the gRPC server.

        Blocks until the channel is ready.
        Never raises any exceptions.
        """
        with metric_grpc_server_channel_creation_latency.time():
            metric_grpc_server_channel_creations.inc()
            while not self._is_shutdown:
                try:
                    candidate = grpc.aio.insecure_channel(self._server_address)
                    await asyncio.wait_for(
                        candidate.channel_ready(), timeout=_CONNECT_TIMEOUT_SEC
                    )
                except Exception:
                    self._logger.error(
                        f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
                    )
                    # Best-effort cleanup of the half-open channel before retrying.
                    try:
                        await candidate.close()
                    except Exception as e:
                        self._logger.error(
                            "failed closing not established channel", exc_info=e
                        )

                    metric_grpc_server_channel_creation_retries.inc()
                    await asyncio.sleep(_RETRY_INTERVAL_SEC)
                else:
                    # Channel became ready within the timeout.
                    return candidate

    async def shutdown(self):
        # Stops the retry loop of any in-flight and future create() calls.
        self._is_shutdown = True
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Counts every attempt to create a channel to the gRPC Server.
metric_grpc_server_channel_creations: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creations",
    "Number of times a channel to gRPC Server was created",
)

# Counts retry rounds performed while establishing a channel.
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creation_retries",
    "Number of retries during a channel creation to gRPC Server",
)

# Latency histogram covering the full channel-establishment loop.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "grpc_server_channel_creation",
    "gRPC server channel creation",
)
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
from ...monitoring.metrics import latency_metric_for_fast_operation
|
4
|
+
|
5
|
+
# Counts every Executor state report RPC sent to the Server.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)

# Counts state report RPCs that failed.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)

# Latency histogram of a single state report RPC round trip.
metric_state_report_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "state_report_rpc", "Executor state report rpc to Server"
)
|
@@ -7,29 +7,29 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
SerializedObject,
|
8
8
|
)
|
9
9
|
|
10
|
-
from indexify.
|
10
|
+
from indexify.proto.task_scheduler_pb2 import (
|
11
11
|
DesiredExecutorState,
|
12
12
|
FunctionExecutorDescription,
|
13
13
|
FunctionExecutorStatus,
|
14
14
|
GetDesiredExecutorStatesRequest,
|
15
15
|
)
|
16
|
-
from indexify.
|
16
|
+
from indexify.proto.task_scheduler_pb2_grpc import (
|
17
17
|
TaskSchedulerServiceStub,
|
18
18
|
)
|
19
19
|
|
20
|
-
from
|
21
|
-
from
|
22
|
-
from
|
23
|
-
from
|
20
|
+
from ..downloader import Downloader
|
21
|
+
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
22
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
23
|
+
from ..function_executor.function_executor_states_container import (
|
24
24
|
FunctionExecutorStatesContainer,
|
25
25
|
)
|
26
|
-
from
|
26
|
+
from ..function_executor.server.function_executor_server_factory import (
|
27
27
|
FunctionExecutorServerConfiguration,
|
28
28
|
FunctionExecutorServerFactory,
|
29
29
|
)
|
30
|
-
from
|
31
|
-
from
|
32
|
-
from
|
30
|
+
from ..function_executor.task_input import TaskInput
|
31
|
+
from ..function_executor.task_output import TaskOutput
|
32
|
+
from ..metrics.executor import (
|
33
33
|
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
34
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
35
|
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
@@ -42,7 +42,10 @@ from .metrics.executor import (
|
|
42
42
|
metric_tasks_fetched,
|
43
43
|
metric_tasks_reporting_outcome,
|
44
44
|
)
|
45
|
-
from
|
45
|
+
from ..task_reporter import TaskReporter
|
46
|
+
from .channel_creator import ChannelCreator
|
47
|
+
|
48
|
+
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
46
49
|
|
47
50
|
|
48
51
|
class ExecutorStateReconciler:
|
@@ -55,11 +58,13 @@ class ExecutorStateReconciler:
|
|
55
58
|
config_path: Optional[str],
|
56
59
|
downloader: Downloader,
|
57
60
|
task_reporter: TaskReporter,
|
58
|
-
|
61
|
+
channel_creator: ChannelCreator,
|
59
62
|
logger: Any,
|
60
63
|
):
|
61
64
|
self._executor_id: str = executor_id
|
62
|
-
self.
|
65
|
+
self._function_executor_server_factory: FunctionExecutorServerFactory = (
|
66
|
+
function_executor_server_factory
|
67
|
+
)
|
63
68
|
self._base_url: str = base_url
|
64
69
|
self._config_path: Optional[str] = config_path
|
65
70
|
self._downloader: Downloader = downloader
|
@@ -67,39 +72,60 @@ class ExecutorStateReconciler:
|
|
67
72
|
self._function_executor_states: FunctionExecutorStatesContainer = (
|
68
73
|
function_executor_states
|
69
74
|
)
|
70
|
-
self.
|
75
|
+
self._channel_creator = channel_creator
|
71
76
|
self._logger: Any = logger.bind(module=__name__)
|
72
77
|
self._is_shutdown: bool = False
|
73
|
-
self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
|
74
78
|
self._server_last_clock: Optional[int] = None
|
75
79
|
|
76
80
|
async def run(self):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
81
|
+
"""Runs the state reconciler.
|
82
|
+
|
83
|
+
Never raises any exceptions.
|
84
|
+
"""
|
85
|
+
while not self._is_shutdown:
|
86
|
+
async with await self._channel_creator.create() as server_channel:
|
87
|
+
server_channel: grpc.aio.Channel
|
88
|
+
stub = TaskSchedulerServiceStub(server_channel)
|
89
|
+
while not self._is_shutdown:
|
90
|
+
try:
|
91
|
+
# TODO: Report state once before starting the stream.
|
92
|
+
desired_states_stream: AsyncGenerator[
|
93
|
+
DesiredExecutorState, None
|
94
|
+
] = stub.get_desired_executor_states(
|
95
|
+
GetDesiredExecutorStatesRequest(
|
96
|
+
executor_id=self._executor_id
|
97
|
+
)
|
98
|
+
)
|
99
|
+
await self._process_desired_states_stream(desired_states_stream)
|
100
|
+
except Exception as e:
|
101
|
+
self._logger.error(
|
102
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
103
|
+
exc_info=e,
|
104
|
+
)
|
105
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
106
|
+
break
|
107
|
+
|
108
|
+
self._logger.info("State reconciler shutdown.")
|
109
|
+
|
110
|
+
async def _process_desired_states_stream(
|
111
|
+
self, desired_states: AsyncGenerator[DesiredExecutorState, None]
|
112
|
+
):
|
82
113
|
async for new_state in desired_states:
|
83
114
|
if self._is_shutdown:
|
84
115
|
return
|
116
|
+
|
85
117
|
new_state: DesiredExecutorState
|
86
118
|
if self._server_last_clock is not None:
|
87
119
|
if self._server_last_clock >= new_state.clock:
|
88
120
|
continue # Duplicate or outdated message state sent by Server.
|
89
121
|
|
90
122
|
self._server_last_clock = new_state.clock
|
91
|
-
|
123
|
+
await self._reconcile_state(new_state)
|
92
124
|
|
93
125
|
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
# Simple non concurrent implementation for now for the PoC.
|
98
|
-
# Obtain this lock to force only a single coroutine doing the reconciliation.
|
99
|
-
async with self._reconciliation_lock:
|
100
|
-
await self._reconcile_function_executors(new_state)
|
101
|
-
# TODO
|
102
|
-
# await self._reconcile_task_allocations(new_state)
|
126
|
+
await self._reconcile_function_executors(new_state)
|
127
|
+
# TODO
|
128
|
+
# await self._reconcile_task_allocations(new_state)
|
103
129
|
|
104
130
|
async def shutdown(self):
|
105
131
|
"""Shuts down the state reconciler.
|
@@ -121,6 +147,7 @@ class ExecutorStateReconciler:
|
|
121
147
|
graph_name=desired_function_executor.graph_name,
|
122
148
|
graph_version=desired_function_executor.graph_version,
|
123
149
|
function_name=desired_function_executor.function_name,
|
150
|
+
image_uri=desired_function_executor.image_uri,
|
124
151
|
)
|
125
152
|
)
|
126
153
|
|
@@ -203,13 +230,15 @@ class ExecutorStateReconciler:
|
|
203
230
|
logger=logger,
|
204
231
|
)
|
205
232
|
function_executor: FunctionExecutor = FunctionExecutor(
|
206
|
-
server_factory=self.
|
233
|
+
server_factory=self._function_executor_server_factory, logger=logger
|
207
234
|
)
|
208
235
|
config: FunctionExecutorServerConfiguration = (
|
209
236
|
FunctionExecutorServerConfiguration(
|
210
237
|
executor_id=self._executor_id,
|
211
238
|
function_executor_id=description.id,
|
239
|
+
namespace=description.namespace,
|
212
240
|
image_uri=description.image_uri,
|
241
|
+
secret_names=list(description.secret_names),
|
213
242
|
)
|
214
243
|
)
|
215
244
|
initialize_request: InitializeRequest = InitializeRequest(
|