indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/run_task.py

@@ -1,14 +1,17 @@
 import asyncio
-import os
-import random
 import time
 from typing import Any, Optional
 
 import grpc
 from tensorlake.function_executor.proto.function_executor_pb2 import (
-
-
-
+    BLOB,
+    AwaitTaskProgress,
+    AwaitTaskRequest,
+    CreateTaskRequest,
+    DeleteTaskRequest,
+    SerializedObjectInsideBLOB,
+    Task,
+    TaskDiagnostics,
 )
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     TaskFailureReason as FETaskFailureReason,
@@ -16,6 +19,9 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     TaskOutcomeCode as FETaskOutcomeCode,
 )
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    TaskResult,
+)
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
@@ -40,9 +46,8 @@ from .metrics.run_task import (
 from .task_info import TaskInfo
 from .task_output import TaskMetrics, TaskOutput
 
-
-
-)
+_CREATE_TASK_TIMEOUT_SECS = 5
+_DELETE_TASK_TIMEOUT_SECS = 5
 
 
 async def run_task_on_function_executor(
@@ -53,7 +58,22 @@ async def run_task_on_function_executor(
     Doesn't raise any exceptions.
     """
     logger = logger.bind(module=__name__)
-
+
+    if task_info.input is None:
+        logger.error(
+            "task input is None, this should never happen",
+        )
+        task_info.output = TaskOutput.internal_error(
+            allocation=task_info.allocation,
+            execution_start_time=None,
+            execution_end_time=None,
+        )
+        return TaskExecutionFinished(
+            task_info=task_info,
+            function_executor_termination_reason=None,
+        )
+
+    task = Task(
         namespace=task_info.allocation.task.namespace,
         graph_name=task_info.allocation.task.graph_name,
         graph_version=task_info.allocation.task.graph_version,
@@ -61,15 +81,8 @@ async def run_task_on_function_executor(
         graph_invocation_id=task_info.allocation.task.graph_invocation_id,
         task_id=task_info.allocation.task.id,
         allocation_id=task_info.allocation.allocation_id,
-
+        request=task_info.input.function_inputs,
     )
-    # Don't keep the input in memory after we started running the task.
-    task_info.input = None
-
-    if task_info.init_value is not None:
-        request.function_init_value.CopyFrom(task_info.init_value)
-        # Don't keep the init value in memory after we started running the task.
-        task_info.init_value = None
 
     function_executor.invocation_state_client().add_task_to_invocation_id_entry(
         task_id=task_info.allocation.task.id,
@@ -78,51 +91,78 @@ async def run_task_on_function_executor(
 
     metric_function_executor_run_task_rpcs.inc()
     metric_function_executor_run_task_rpcs_in_progress.inc()
-    start_time = time.monotonic()
     # Not None if the Function Executor should be terminated after running the task.
     function_executor_termination_reason: Optional[
         FunctionExecutorTerminationReason
     ] = None
-
+
+    # NB: We start this timer before invoking the first RPC, since
+    # user code should be executing by the time the create_task() RPC
+    # returns, so not attributing the task management RPC overhead to
+    # the user would open a possibility for abuse. (This is somewhat
+    # mitigated by the fact that these RPCs should have a very low
+    # overhead.)
+    execution_start_time: Optional[float] = time.monotonic()
 
     # If this RPC failed due to customer code crashing the server we won't be
     # able to detect this. We'll treat this as our own error for now and thus
     # let the AioRpcError to be raised here.
-    timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
+    timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
     try:
-
-
-
-
-
-        task_info.output =
+        # This aio task can only be cancelled during this await call.
+        task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
+
+        _process_task_diagnostics(task_result.diagnostics, logger)
+
+        task_info.output = _task_output_from_function_executor_result(
             allocation=task_info.allocation,
-
+            result=task_result,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
             logger=logger,
         )
+    except asyncio.TimeoutError:
+        # This is an await_task() RPC timeout - we're not getting
+        # progress messages or a task completion.
+        function_executor_termination_reason = (
+            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
+        )
+        task_info.output = TaskOutput.function_timeout(
+            allocation=task_info.allocation,
+            execution_start_time=execution_start_time,
+            execution_end_time=time.monotonic(),
+        )
     except grpc.aio.AioRpcError as e:
+        # This indicates some sort of problem communicating with the FE.
+        #
+        # NB: We charge the user in these situations: code within the
+        # FE is not isolated, so not charging would enable abuse.
+        #
+        # This is an unexpected situation, though, so we make sure to
+        # log the situation for further investigation.
+
+        function_executor_termination_reason = (
+            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+        )
+        metric_function_executor_run_task_rpc_errors.inc()
+
         if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            #
-
-
-            )
-            task_info.output = TaskOutput.function_timeout(
-                allocation=task_info.allocation,
-                timeout_sec=timeout_sec,
-                execution_start_time=execution_start_time,
-                execution_end_time=time.monotonic(),
-            )
+            # This is either a create_task() RPC timeout or a
+            # delete_task() RPC timeout; either suggests that the FE
+            # is unhealthy.
+            logger.error("task management RPC execution deadline exceeded", exc_info=e)
         else:
-
-
-
-
-
-
-
+            # This is a status from an unsuccessful RPC; this
+            # shouldn't happen, but we handle it.
+            logger.error("task management RPC failed", exc_info=e)
+
+        task_info.output = TaskOutput.function_executor_unresponsive(
+            allocation=task_info.allocation,
+            execution_start_time=execution_start_time,
+            execution_end_time=time.monotonic(),
+        )
     except asyncio.CancelledError:
+        # Handle aio task cancellation during `await _run_task_rpcs`.
         # The task is still running in FE, we only cancelled the client-side RPC.
         function_executor_termination_reason = (
            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
@@ -133,15 +173,20 @@ async def run_task_on_function_executor(
             execution_end_time=time.monotonic(),
         )
     except Exception as e:
-
-
+        # This is an unexpected exception; we believe that this
+        # indicates an internal error.
+        logger.error(
+            "Unexpected internal error during task lifecycle RPC sequence", exc_info=e
+        )
         task_info.output = TaskOutput.internal_error(
             allocation=task_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
 
-    metric_function_executor_run_task_rpc_latency.observe(
+    metric_function_executor_run_task_rpc_latency.observe(
+        time.monotonic() - execution_start_time
+    )
     metric_function_executor_run_task_rpcs_in_progress.dec()
 
     function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
@@ -152,16 +197,21 @@ async def run_task_on_function_executor(
         task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
         and function_executor_termination_reason is None
     ):
-
-
-
-
-
-
-
-
-
-
+        try:
+            # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
+            result: HealthCheckResult = await function_executor.health_checker().check()
+            if not result.is_healthy:
+                function_executor_termination_reason = (
+                    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+                )
+                logger.error(
+                    "Function Executor health check failed after running task, shutting down Function Executor",
+                    health_check_fail_reason=result.reason,
+                )
+        except asyncio.CancelledError:
+            # The aio task was cancelled during the health check await.
+            # We can't conclude anything about the health of the FE here.
+            pass
 
     _log_task_execution_finished(output=task_info.output, logger=logger)
 
@@ -171,56 +221,106 @@ async def run_task_on_function_executor(
     )
 
 
-def
+async def _run_task_rpcs(
+    task: Task, function_executor: FunctionExecutor, timeout_sec: float
+) -> TaskResult:
+    """Runs the task, returning the result, reporting errors via exceptions."""
+    task_result: Optional[TaskResult] = None
+    channel: grpc.aio.Channel = function_executor.channel()
+    fe_stub = FunctionExecutorStub(channel)
+
+    # Create task with timeout
+    await fe_stub.create_task(
+        CreateTaskRequest(task=task), timeout=_CREATE_TASK_TIMEOUT_SECS
+    )
+
+    # Await task with timeout resets on each response
+    await_rpc = fe_stub.await_task(AwaitTaskRequest(task_id=task.task_id))
+
+    try:
+        while True:
+            # Wait for next response with fresh timeout each time
+            response: AwaitTaskProgress = await asyncio.wait_for(
+                await_rpc.read(), timeout=timeout_sec
+            )
+
+            if response == grpc.aio.EOF:
+                break
+            elif response.WhichOneof("response") == "task_result":
+                task_result = response.task_result
+                break
+
+            # NB: We don't actually check for other message types
+            # here; any message from the FE is treated as an
+            # indication that it's making forward progress.
+    finally:
+        # Cancel the outstanding RPC to ensure any resources in use
+        # are cleaned up; note that this is idempotent (in case the
+        # RPC has already completed).
+        await_rpc.cancel()
+
+    # Delete task with timeout
+    await fe_stub.delete_task(
+        DeleteTaskRequest(task_id=task.task_id), timeout=_DELETE_TASK_TIMEOUT_SECS
+    )
+
+    if task_result is None:
+        raise grpc.aio.AioRpcError(
+            grpc.StatusCode.CANCELLED,
+            None,
+            None,
+            "Function Executor didn't return function/task alloc result",
+        )
+
+    return task_result
+
+
+def _task_output_from_function_executor_result(
     allocation: TaskAllocation,
-
+    result: TaskResult,
     execution_start_time: Optional[float],
     execution_end_time: Optional[float],
     logger: Any,
 ) -> TaskOutput:
-    response_validator = MessageValidator(
-    response_validator.required_field("stdout")
-    response_validator.required_field("stderr")
+    response_validator = MessageValidator(result)
     response_validator.required_field("outcome_code")
 
     metrics = TaskMetrics(counters={}, timers={})
-    if
+    if result.HasField("metrics"):
         # Can be None if e.g. function failed.
-        metrics.counters = dict(
-        metrics.timers = dict(
+        metrics.counters = dict(result.metrics.counters)
+        metrics.timers = dict(result.metrics.timers)
 
     outcome_code: TaskOutcomeCode = _to_task_outcome_code(
-
+        result.outcome_code, logger=logger
     )
     failure_reason: Optional[TaskFailureReason] = None
-    invocation_error_output: Optional[
+    invocation_error_output: Optional[SerializedObjectInsideBLOB] = None
+    uploaded_invocation_error_blob: Optional[BLOB] = None
 
     if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
         response_validator.required_field("failure_reason")
         failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
-
+            result.failure_reason, logger
         )
         if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
             response_validator.required_field("invocation_error_output")
-
-
-
-
-
-
-        ):  # 50% chance to get stable reproduction in manual testing
-            outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
-            failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
+            response_validator.required_field("uploaded_invocation_error_blob")
+            invocation_error_output = result.invocation_error_output
+            uploaded_invocation_error_blob = result.uploaded_invocation_error_blob
+    elif outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+        # function_outputs can have no items, this happens when the function returns None.
+        response_validator.required_field("uploaded_function_outputs_blob")
 
     return TaskOutput(
         allocation=allocation,
         outcome_code=outcome_code,
         failure_reason=failure_reason,
+        function_outputs=list(result.function_outputs),
+        uploaded_function_outputs_blob=result.uploaded_function_outputs_blob,
         invocation_error_output=invocation_error_output,
-
-        next_functions=
-        stdout=response.stdout,
-        stderr=response.stderr,
+        uploaded_invocation_error_blob=uploaded_invocation_error_blob,
+        next_functions=list(result.next_functions),
         metrics=metrics,
         execution_start_time=execution_start_time,
         execution_end_time=execution_end_time,
@@ -240,6 +340,14 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
     )
 
 
+def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) -> None:
+    MessageValidator(task_diagnostics).required_field("function_executor_log")
+    # Uncomment these lines once we stop printing FE logs to stdout/stderr.
+    # Print FE logs directly to Executor logs so operators can see them.
+    # logger.info("Function Executor logs during task execution:")
+    # print(task_diagnostics.function_executor_log)
+
+
 def _to_task_outcome_code(
     fe_task_outcome_code: FETaskOutcomeCode, logger
 ) -> TaskOutcomeCode:
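Note on the run_task.py rewrite above: the single long-deadline run_task() RPC is replaced by a three-step lifecycle — create_task(), a streamed await_task() whose timeout resets on every AwaitTaskProgress message, then delete_task() — so a function only times out when the Function Executor stops reporting progress. Below is a minimal, self-contained sketch of that watchdog pattern; await_with_progress and fake_stream are illustrative names standing in for the gRPC stream, not indexify APIs.

import asyncio
from typing import AsyncIterator, Optional


async def await_with_progress(
    stream: AsyncIterator[str], timeout_sec: float
) -> Optional[str]:
    # A fresh timeout is applied to every read: any message counts as
    # forward progress, and only a silent stream raises TimeoutError.
    it = stream.__aiter__()
    while True:
        msg = await asyncio.wait_for(it.__anext__(), timeout=timeout_sec)
        if msg == "result":  # terminal message, analogous to task_result
            return msg


async def main() -> None:
    async def fake_stream() -> AsyncIterator[str]:
        for _ in range(3):
            await asyncio.sleep(0.1)  # each tick arrives well under the timeout
            yield "progress"
        yield "result"

    print(await await_with_progress(fake_stream(), timeout_sec=0.5))


asyncio.run(main())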
indexify/executor/function_executor_controller/task_info.py

@@ -2,10 +2,9 @@ import asyncio
 from dataclasses import dataclass
 from typing import Optional
 
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-
 from indexify.proto.executor_api_pb2 import TaskAllocation
 
+from .task_input import TaskInput
 from .task_output import TaskOutput
 
 
@@ -22,11 +21,9 @@ class TaskInfo:
     is_cancelled: bool = False
     # aio task that is currently executing a lifecycle step of this task.
     aio_task: Optional[asyncio.Task] = None
-    #
-    input: Optional[
-    #
-    init_value: Optional[SerializedObject] = None
-    # Output of the task.
+    # Input if function was prepared successfully.
+    input: Optional[TaskInput] = None
+    # Output of the task, always set when the task is completed.
     output: Optional[TaskOutput] = None
     # True if the task is fully completed and was added to state reporter.
     is_completed: bool = False
indexify/executor/function_executor_controller/task_input.py (new file)

@@ -0,0 +1,21 @@
+from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
+
+
+class TaskInput:
+    """Represents the input for a task in the function executor controller."""
+
+    def __init__(
+        self,
+        function_inputs: FunctionInputs,
+        function_outputs_blob_uri: str,
+        function_outputs_blob_upload_id: str,
+        invocation_error_blob_uri: str,
+        invocation_error_blob_upload_id: str,
+    ):
+        # Actual input object sent to FE.
+        self.function_inputs = function_inputs
+        # Executor side function input related bookkeeping.
+        self.function_outputs_blob_uri = function_outputs_blob_uri
+        self.function_outputs_blob_upload_id = function_outputs_blob_upload_id
+        self.invocation_error_blob_uri = invocation_error_blob_uri
+        self.invocation_error_blob_upload_id = invocation_error_blob_upload_id
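For context, TaskInput pairs the FunctionInputs proto message sent to the Function Executor with executor-side blob bookkeeping (output/error blob URIs plus their upload ids). A hypothetical construction, with placeholder URI and upload-id values rather than anything indexify actually produces:

from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs

from indexify.executor.function_executor_controller.task_input import TaskInput

task_input = TaskInput(
    function_inputs=FunctionInputs(),  # normally populated by the prepare-task step
    function_outputs_blob_uri="s3://example-bucket/outputs/task-1",  # placeholder
    function_outputs_blob_upload_id="upload-outputs-1",  # placeholder
    invocation_error_blob_uri="s3://example-bucket/errors/task-1",  # placeholder
    invocation_error_blob_upload_id="upload-errors-1",  # placeholder
)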
indexify/executor/function_executor_controller/task_output.py

@@ -1,19 +1,17 @@
 from typing import Any, Dict, List, Optional
 
 from tensorlake.function_executor.proto.function_executor_pb2 import (
-
+    BLOB,
+    SerializedObjectInsideBLOB,
 )
 
 from indexify.proto.executor_api_pb2 import (
-    DataPayload,
     FunctionExecutorTerminationReason,
     TaskAllocation,
     TaskFailureReason,
     TaskOutcomeCode,
 )
 
-from .function_executor_startup_output import FunctionExecutorStartupOutput
-
 
 class TaskMetrics:
     """Metrics for a task."""
@@ -30,33 +28,27 @@ class TaskOutput:
         self,
         allocation: TaskAllocation,
         outcome_code: TaskOutcomeCode,
-
-
-
-
+        failure_reason: Optional[TaskFailureReason] = None,
+        function_outputs: List[SerializedObjectInsideBLOB] = [],
+        uploaded_function_outputs_blob: Optional[BLOB] = None,
+        invocation_error_output: Optional[SerializedObjectInsideBLOB] = None,
+        uploaded_invocation_error_blob: Optional[BLOB] = None,
         next_functions: List[str] = [],
-        stdout: Optional[str] = None,
-        stderr: Optional[str] = None,
         metrics: Optional[TaskMetrics] = None,
         execution_start_time: Optional[float] = None,
         execution_end_time: Optional[float] = None,
     ):
-        self.task = allocation.task
         self.allocation = allocation
-        self.function_outputs = function_outputs
-        self.next_functions = next_functions
-        self.stdout = stdout
-        self.stderr = stderr
         self.outcome_code = outcome_code
         self.failure_reason = failure_reason
+        self.function_outputs = function_outputs
+        self.uploaded_function_outputs_blob = uploaded_function_outputs_blob
         self.invocation_error_output = invocation_error_output
+        self.uploaded_invocation_error_blob = uploaded_invocation_error_blob
+        self.next_functions = next_functions
         self.metrics = metrics
         self.execution_start_time = execution_start_time
         self.execution_end_time = execution_end_time
-        self.uploaded_data_payloads: List[DataPayload] = []
-        self.uploaded_stdout: Optional[DataPayload] = None
-        self.uploaded_stderr: Optional[DataPayload] = None
-        self.uploaded_invocation_error_output: Optional[DataPayload] = None
 
     @classmethod
     def internal_error(
@@ -66,12 +58,10 @@ class TaskOutput:
         execution_end_time: Optional[float],
     ) -> "TaskOutput":
         """Creates a TaskOutput for an internal error."""
-        # We are not sharing internal error messages with the customer.
         return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
-            stderr="Platform failed to execute the function.",
             execution_start_time=execution_start_time,
             execution_end_time=execution_end_time,
         )
@@ -80,17 +70,33 @@ class TaskOutput:
     def function_timeout(
         cls,
         allocation: TaskAllocation,
-        timeout_sec: float,
         execution_start_time: Optional[float],
         execution_end_time: Optional[float],
     ) -> "TaskOutput":
-        """Creates a TaskOutput for
-        # Task stdout, stderr is not available.
+        """Creates a TaskOutput for a function timeout error."""
         return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
-
+            execution_start_time=execution_start_time,
+            execution_end_time=execution_end_time,
+        )
+
+    @classmethod
+    def function_executor_unresponsive(
+        cls,
+        allocation: TaskAllocation,
+        execution_start_time: Optional[float],
+        execution_end_time: Optional[float],
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for an unresponsive FE aka grey failure."""
+        # When FE is unresponsive we don't know exact cause of the failure.
+        return TaskOutput(
+            allocation=allocation,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            # Treat the grey failure as a function error and thus charge the customer.
+            # This is to prevent service abuse by intentionally misbehaving functions.
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR,
             execution_start_time=execution_start_time,
             execution_end_time=execution_end_time,
         )
@@ -127,21 +133,17 @@ class TaskOutput:
     def function_executor_startup_failed(
         cls,
         allocation: TaskAllocation,
-
+        fe_termination_reason: FunctionExecutorTerminationReason,
         logger: Any,
     ) -> "TaskOutput":
         """Creates a TaskOutput for the case when we fail a task that didn't run because its FE startup failed."""
-
+        return TaskOutput(
             allocation=allocation,
             outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
             failure_reason=_fe_startup_failure_reason_to_task_failure_reason(
-
+                fe_termination_reason, logger
             ),
         )
-        # Use FE startup stdout, stderr for allocations that we failed because FE startup failed.
-        output.uploaded_stdout = fe_startup_output.stdout
-        output.uploaded_stderr = fe_startup_output.stderr
-        return output
 
 
 def _fe_startup_failure_reason_to_task_failure_reason(
@@ -163,6 +165,12 @@ def _fe_startup_failure_reason_to_task_failure_reason(
         == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR
     ):
         return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
+    elif (
+        fe_termination_reason
+        == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
+    ):
+        # This fe termination reason is used when FE gets deleted by Server from desired state while it's starting up.
+        return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
     else:
         logger.error(
             "unexpected function executor startup failure reason",
@@ -170,4 +178,4 @@ def _fe_startup_failure_reason_to_task_failure_reason(
                 fe_termination_reason
             ),
         )
-        return TaskFailureReason.
+        return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
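task_output.py leans on the classmethod-factory idiom: each failure mode (internal_error, function_timeout, function_executor_unresponsive, function_executor_startup_failed) is a named constructor that fixes the outcome code and failure reason together, so call sites cannot combine them inconsistently. A reduced sketch of the idiom with illustrative enums, not the real TaskOutput schema:

from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class Outcome(Enum):
    SUCCESS = auto()
    FAILURE = auto()


class FailureReason(Enum):
    INTERNAL_ERROR = auto()
    FUNCTION_TIMEOUT = auto()


@dataclass
class Output:
    outcome: Outcome
    failure_reason: Optional[FailureReason] = None

    @classmethod
    def internal_error(cls) -> "Output":
        # Pins both fields in one place; callers can't build a FAILURE
        # output that is missing (or carries the wrong) reason.
        return cls(Outcome.FAILURE, FailureReason.INTERNAL_ERROR)

    @classmethod
    def function_timeout(cls) -> "Output":
        return cls(Outcome.FAILURE, FailureReason.FUNCTION_TIMEOUT)


print(Output.internal_error())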
indexify/executor/function_executor_controller/terminate_function_executor.py

@@ -29,7 +29,12 @@ async def terminate_function_executor(
     logger.info(
         "destroying function executor",
     )
-
+    try:
+        # This await is a cancellation point, need to shield to ensure we destroyed the FE.
+        await asyncio.shield(function_executor.destroy())
+    except asyncio.CancelledError:
+        # We actually destroyed the FE so we can return without error.
+        pass
 
     return FunctionExecutorTerminated(
         is_success=True,