PyPI - indexify - Versions diffs - 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl - Mend

indexify 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

indexify/executor/function_executor_controller/function_executor_controller.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import asyncio
 import time
 from collections.abc import Coroutine
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional
@@ -43,12 +44,14 @@ from .events import (
 from .function_executor_startup_output import FunctionExecutorStartupOutput
 from .loggers import function_executor_logger, task_allocation_logger
 from .metrics.function_executor_controller import (
-    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
-    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
-    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
-    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING,
+    METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
     metric_control_loop_handle_event_latency,
-    metric_function_executors_with_status,
+    metric_function_executors_with_state,
     metric_runnable_tasks,
     metric_runnable_tasks_per_function_name,
     metric_schedule_task_latency,
@@ -61,6 +64,16 @@ from .task_output import TaskOutput
 from .upload_task_output import upload_task_output
+# Actual FE controller states, they are a bit different from statuses reported to the Server.
+# All the valid state transitions are forward only (can skip multiple states in a row).
+class _FE_CONTROLLER_STATE(Enum):
+    NOT_STARTED = 1
+    STARTING_UP = 2
+    RUNNING = 3
+    TERMINATING = 4
+    TERMINATED = 5
 class FunctionExecutorController:
     def __init__(
         self,
@@ -94,19 +107,18 @@ class FunctionExecutorController:
         self._logger: Any = function_executor_logger(
             function_executor_description, logger.bind(module=__name__)
         )
-        # Mutable state. No lock needed as it's modified by async tasks running in
-        # the same event loop.
+        self._destroy_lock: asyncio.Lock = asyncio.Lock()
+        # Mutable state. No lock needed as it's modified by async tasks running in the same event loop.
         self._fe: Optional[FunctionExecutor] = None
-        self._fe_termination_reason: FunctionExecutorTerminationReason = (
-            None  # Optional
-        )
-        # FE Status reported to Server.
-        self._status: FunctionExecutorStatus = (
-            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
-        )
-        metric_function_executors_with_status.labels(
-            status=_to_fe_status_metric_label(self._status, self._logger)
+        self._fe_termination_reason: Optional[FunctionExecutorTerminationReason] = None
+        self._internal_state = _FE_CONTROLLER_STATE.NOT_STARTED
+        metric_function_executors_with_state.labels(
+            state=_to_fe_state_metric_label(self._internal_state, self._logger)
         ).inc()
+        self._reported_state: FunctionExecutorState = FunctionExecutorState(
+            description=function_executor_description,
+            status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN,
+        )
         # Ordered list of events to be processed by the control loop.
         self._events: List[BaseEvent] = []
         # Asyncio event used to notify the control loop that there are new events to process.
@@ -124,13 +136,6 @@ class FunctionExecutorController:
     def function_executor_id(self) -> str:
         return self._fe_description.id
-    def status(self) -> FunctionExecutorStatus:
-        """Returns the current status of the Function Executor.
-        Not blocking.
-        """
-        return self._status
     def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
         """Adds a task to the Function Executor.
@@ -197,7 +202,6 @@ class FunctionExecutorController:
         task_info.is_cancelled = True
         logger.info(
             "cancelling task",
-            allocation_id=task_info.allocation.allocation_id,
         )
         if task_info.aio_task is not None:
             task_info.aio_task.cancel()
@@ -206,9 +210,10 @@ class FunctionExecutorController:
         """Starts up the Function Executor and prepares it to run tasks.
         Not blocking. Never raises exceptions."""
-        if self._control_loop_aio_task is not None:
+        if self._internal_state != _FE_CONTROLLER_STATE.NOT_STARTED:
             self._logger.warning(
-                "ignoring startup call as the Function Executor is already started"
+                "function executor state is not NOT_STARTED, ignoring startup call",
+                internal_state=self._internal_state.name,
             )
             return
@@ -216,7 +221,13 @@ class FunctionExecutorController:
             self._control_loop(),
             name="function executor control loop",
         )
-        self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING)
+        self._update_internal_state(_FE_CONTROLLER_STATE.STARTING_UP)
+        self._update_reported_state(
+            FunctionExecutorState(
+                description=self._fe_description,
+                status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING,
+            )
+        )
         next_aio = create_function_executor(
             function_executor_description=self._fe_description,
             function_executor_server_factory=self._fe_server_factory,
@@ -238,17 +249,13 @@ class FunctionExecutorController:
             ),
         )
-    async def shutdown(
-        self, termination_reason: FunctionExecutorTerminationReason
-    ) -> None:
+    async def shutdown(self) -> None:
         """Shutsdown the Function Executor and frees all of its resources.
-        All the tasks are reported as failed with FE Terminated failure code.
+        No task outcomes and outputs are getting reported to Server after this call.
         Doesn't raise any exceptions. Blocks until the shutdown is complete.
         """
-        self._add_event(
-            ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
-        )
+        self._add_event(ShutdownInitiated(), source="shutdown")
         try:
             await self._control_loop_aio_task
         except asyncio.CancelledError:
@@ -260,51 +267,49 @@ class FunctionExecutorController:
             )
         self._logger.info("function executor controller shutdown finished")
-    def _set_status(
+    def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
+        """Updates the internal state of the Function Executor Controller.
+        Not blocking. Never raises exceptions."""
+        old_state: _FE_CONTROLLER_STATE = self._internal_state
+        self._internal_state = new_state
+        self._logger.info(
+            "function executor internal state changed",
+            old_state=old_state.name,
+            new_state=new_state.name,
+        )
+        metric_function_executors_with_state.labels(
+            state=_to_fe_state_metric_label(old_state, self._logger)
+        ).dec()
+        metric_function_executors_with_state.labels(
+            state=_to_fe_state_metric_label(new_state, self._logger)
+        ).inc()
+    def _update_reported_state(
         self,
-        status: FunctionExecutorStatus,
+        new_state: FunctionExecutorState,
     ) -> None:
-        """Sets Function Executor status and reports it to the Server.
+        """Sets new Function Executor state and reports it to the Server.
         Not blocking. Never raises exceptions."""
-        old_status: FunctionExecutorStatus = self._status
-        new_status: FunctionExecutorStatus = status
-        self._status: FunctionExecutorStatus = new_status
+        old_state: FunctionExecutorState = self._reported_state
+        self._reported_state = new_state
         self._logger.info(
-            "function executor status changed",
-            old_status=FunctionExecutorStatus.Name(old_status),
-            new_status=FunctionExecutorStatus.Name(new_status),
+            "function executor grpc status changed",
+            old_status=FunctionExecutorStatus.Name(old_state.status),
+            new_status=FunctionExecutorStatus.Name(new_state.status),
             termination_reason=_termination_reason_to_short_name(
-                self._fe_termination_reason
+                new_state.termination_reason
             ),
         )
-        metric_function_executors_with_status.labels(
-            status=_to_fe_status_metric_label(old_status, self._logger)
-        ).dec()
-        metric_function_executors_with_status.labels(
-            status=_to_fe_status_metric_label(new_status, self._logger)
-        ).inc()
-        self._state_reporter.update_function_executor_state(self._current_state())
+        self._state_reporter.update_function_executor_state(new_state)
         # Report the status change to the Server asap to reduce latency in the system.
         self._state_reporter.schedule_state_report()
-    def _current_state(self) -> FunctionExecutorState:
-        """Returns the current state of the Function Executor.
-        Not blocking. Never raises exceptions.
-        """
-        termination_reason: Optional[FunctionExecutorTerminationReason] = None
-        if self._fe_termination_reason is not None:
-            termination_reason = self._fe_termination_reason
-        return FunctionExecutorState(
-            description=self._fe_description,
-            status=self._status,
-            termination_reason=termination_reason,
-        )
     async def _control_loop(self) -> None:
         """Runs control loop that coordinates all the work done by the Function Executor.
@@ -332,7 +337,7 @@ class FunctionExecutorController:
                     self._logger.error(
                         "unexpected exception in function executor controller control loop",
                         exc_info=e,
-                        fe_event=str(event),
+                        event_type=event.event_type.name,
                     )
     def _handle_event(self, event: BaseEvent) -> None:
@@ -455,13 +460,17 @@ class FunctionExecutorController:
         self._state_reporter.schedule_state_report()
         if event.function_executor is None:
-            self._destroy_function_executor_before_termination(
-                event.output.termination_reason
-            )
+            self._start_termination(termination_reason=event.output.termination_reason)
             return
         self._fe = event.function_executor
-        self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING)
+        self._update_internal_state(_FE_CONTROLLER_STATE.RUNNING)
+        self._update_reported_state(
+            FunctionExecutorState(
+                description=self._fe_description,
+                status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
+            )
+        )
         # Health checker starts after FE creation and gets automatically stopped on FE destroy.
         self._fe.health_checker().start(self._health_check_failed_callback)
         self._add_event(
@@ -480,9 +489,18 @@ class FunctionExecutorController:
             self._logger.error(
                 "Function Executor destroy failed unexpectedly, this should never happen",
             )
-        # Set the status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
-        self._fe_termination_reason = event.termination_reason
-        self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED)
+        self._fe = None
+        # Set reported status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
+        self._update_reported_state(
+            FunctionExecutorState(
+                description=self._fe_description,
+                status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
+                termination_reason=self._fe_termination_reason,
+            )
+        )
+        self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATED)
         # Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
         self._add_event(
             ScheduleTaskExecution(),
@@ -494,7 +512,7 @@ class FunctionExecutorController:
             "Function Executor health check failed, terminating Function Executor",
             reason=result.reason,
         )
-        self._destroy_function_executor_before_termination(
+        self._start_termination(
             termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
         )
@@ -533,14 +551,15 @@ class FunctionExecutorController:
         if len(self._runnable_tasks) == 0:
             return
-        if self._status not in [
-            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
-            FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
+        if self._internal_state not in [
+            _FE_CONTROLLER_STATE.RUNNING,
+            _FE_CONTROLLER_STATE.TERMINATING,
+            _FE_CONTROLLER_STATE.TERMINATED,
         ]:
-            return  # Can't progress pending task with the current status.
+            return  # Can't progress runnable tasks in the current state.
         if (
-            self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING
+            self._internal_state == _FE_CONTROLLER_STATE.RUNNING
             and self._running_task is not None
         ):
             return
@@ -556,12 +575,15 @@ class FunctionExecutorController:
         if task_info.is_cancelled:
             task_info.output = TaskOutput.task_cancelled(task_info.allocation)
             self._start_task_output_upload(task_info)
-        elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
+        elif self._internal_state in [
+            _FE_CONTROLLER_STATE.TERMINATING,
+            _FE_CONTROLLER_STATE.TERMINATED,
+        ]:
             task_info.output = TaskOutput.function_executor_terminated(
                 task_info.allocation
             )
             self._start_task_output_upload(task_info)
-        elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
+        elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
             self._running_task = task_info
             next_aio = run_task_on_function_executor(
                 task_info=task_info,
@@ -604,7 +626,7 @@ class FunctionExecutorController:
                 ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
             )
         else:
-            self._destroy_function_executor_before_termination(
+            self._start_termination(
                 termination_reason=event.function_executor_termination_reason
             )
@@ -661,24 +683,31 @@ class FunctionExecutorController:
         )
         self._state_reporter.schedule_state_report()
-    def _destroy_function_executor_before_termination(
+    def _start_termination(
         self, termination_reason: FunctionExecutorTerminationReason
     ) -> None:
-        """Destroys the Function Executor and frees all its resources to prepare for transitioning to the TERMINATED state.
+        """Starts termination of the Function Executor if it's not started yet.
         Doesn't raise any exceptions. Doesn't block.
         """
+        if self._internal_state in [
+            _FE_CONTROLLER_STATE.TERMINATING,
+            _FE_CONTROLLER_STATE.TERMINATED,
+        ]:
+            # _start_termination() can be called multiple times, e.g. by each failed task alloc
+            # when the FE is unhealthy. Dedup the calls to keep state machine consistent.
+            return
+        self._fe_termination_reason = termination_reason
+        self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATING)
         next_aio = destroy_function_executor(
             function_executor=self._fe,
-            termination_reason=termination_reason,
+            lock=self._destroy_lock,
             logger=self._logger,
         )
-        self._fe = None
         self._spawn_aio_for_fe(
             aio=next_aio,
-            on_exception=FunctionExecutorDestroyed(
-                is_success=False, termination_reason=termination_reason
-            ),
+            on_exception=FunctionExecutorDestroyed(is_success=False),
         )
     async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
@@ -717,16 +746,15 @@ class FunctionExecutorController:
                 # BaseException includes asyncio.CancelledError which is always raised here.
                 pass
-        if self._status != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
-            self._handle_event_function_executor_destroyed(
-                await destroy_function_executor(
-                    function_executor=self._fe,
-                    termination_reason=event.termination_reason,
-                    logger=self._logger,
-                )
-            )
-        metric_function_executors_with_status.labels(
-            status=_to_fe_status_metric_label(self._status, self._logger)
+        await destroy_function_executor(
+            function_executor=self._fe,
+            lock=self._destroy_lock,
+            logger=self._logger,
+        )
+        # Cleanup the metric from this FE.
+        metric_function_executors_with_state.labels(
+            state=_to_fe_state_metric_label(self._internal_state, self._logger)
         ).dec()
         self._state_reporter.remove_function_executor_state(self.function_executor_id())
@@ -736,21 +764,23 @@ class FunctionExecutorController:
         debug_print_events(events=self._events, logger=self._logger)
-def _to_fe_status_metric_label(status: FunctionExecutorStatus, logger: Any) -> str:
-    if status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
-        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
-    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING:
-        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
-    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
-        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
-    elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
-        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
+def _to_fe_state_metric_label(state: _FE_CONTROLLER_STATE, logger: Any) -> str:
+    if state == _FE_CONTROLLER_STATE.NOT_STARTED:
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
+    elif state == _FE_CONTROLLER_STATE.STARTING_UP:
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
+    elif state == _FE_CONTROLLER_STATE.RUNNING:
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
+    elif state == _FE_CONTROLLER_STATE.TERMINATING:
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
+    elif state == _FE_CONTROLLER_STATE.TERMINATED:
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
     else:
         logger.error(
-            "unexpected Function Executor status",
-            status=FunctionExecutorStatus.Name(status),
+            "unexpected Function Executor internal state",
+            state=state.name,
         )
-        return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
+        return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
 _termination_reason_to_short_name_map = {
@@ -758,8 +788,6 @@ _termination_reason_to_short_name_map = {
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
-    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
-    FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
@@ -786,16 +814,14 @@ def _to_task_result_proto(output: TaskOutput) -> TaskResult:
         graph_invocation_id=output.allocation.task.graph_invocation_id,
         reducer=output.reducer,
         outcome_code=output.outcome_code,
-        next_functions=(output.router_output.edges if output.router_output else []),
+        failure_reason=output.failure_reason,
+        next_functions=output.next_functions,
         function_outputs=output.uploaded_data_payloads,
+        invocation_error_output=output.uploaded_invocation_error_output,
     )
-    if output.failure_reason is not None:
-        task_result.failure_reason = output.failure_reason
     if output.uploaded_stdout is not None:
         task_result.stdout.CopyFrom(output.uploaded_stdout)
     if output.uploaded_stderr is not None:
         task_result.stderr.CopyFrom(output.uploaded_stderr)
-    if output.router_output is not None:
-        task_result.routing.next_functions[:] = output.router_output.edges
     return task_result

indexify/executor/function_executor_controller/metrics/function_executor_controller.py CHANGED Viewed

@@ -34,27 +34,34 @@ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
     )
 )
-metric_function_executors_with_status: prometheus_client.Gauge = (
-    prometheus_client.Gauge(
-        "function_executors_with_status",
-        "Number of Function Executors with a particular status",
-        ["status"],
-    )
+metric_function_executors_with_state: prometheus_client.Gauge = prometheus_client.Gauge(
+    "function_executors_with_state",
+    "Number of Function Executors with a particular internal state",
+    ["state"],
 )
-METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
-METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
-METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
-METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN = "unknown"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED = "not_started"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP = "starting_up"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING = "running"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING = "terminating"
+METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED = "terminated"
-metric_function_executors_with_status.labels(
-    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
+)
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
+)
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
 )
-metric_function_executors_with_status.labels(
-    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
 )
-metric_function_executors_with_status.labels(
-    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
 )
-metric_function_executors_with_status.labels(
-    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
+metric_function_executors_with_state.labels(
+    state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
 )

indexify/executor/function_executor_controller/run_task.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import asyncio
+import os
+import random
 import time
 from typing import Any, Optional
@@ -6,6 +8,13 @@ import grpc
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     RunTaskRequest,
     RunTaskResponse,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    TaskFailureReason as FETaskFailureReason,
+)
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    TaskOutcomeCode as FETaskOutcomeCode,
 )
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
@@ -31,6 +40,10 @@ from .metrics.run_task import (
 from .task_info import TaskInfo
 from .task_output import TaskMetrics, TaskOutput
+_ENABLE_INJECT_TASK_CANCELLATIONS = (
+    os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
+)
 async def run_task_on_function_executor(
     task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
@@ -83,6 +96,7 @@ async def run_task_on_function_executor(
         task_info.output = _task_output_from_function_executor_response(
             allocation=task_info.allocation,
             response=response,
+            logger=logger,
         )
     except grpc.aio.AioRpcError as e:
         if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
@@ -140,13 +154,13 @@ async def run_task_on_function_executor(
 def _task_output_from_function_executor_response(
-    allocation: TaskAllocation, response: RunTaskResponse
+    allocation: TaskAllocation, response: RunTaskResponse, logger: Any
 ) -> TaskOutput:
     response_validator = MessageValidator(response)
     response_validator.required_field("stdout")
     response_validator.required_field("stderr")
     response_validator.required_field("is_reducer")
-    response_validator.required_field("success")
+    response_validator.required_field("outcome_code")
     metrics = TaskMetrics(counters={}, timers={})
     if response.HasField("metrics"):
@@ -154,31 +168,42 @@ def _task_output_from_function_executor_response(
         metrics.counters = dict(response.metrics.counters)
         metrics.timers = dict(response.metrics.timers)
-    output = TaskOutput(
+    outcome_code: TaskOutcomeCode = _to_task_outcome_code(
+        response.outcome_code, logger=logger
+    )
+    failure_reason: Optional[TaskFailureReason] = None
+    invocation_error_output: Optional[SerializedObject] = None
+    if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
+        response_validator.required_field("failure_reason")
+        failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
+            response.failure_reason, logger
+        )
+        if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
+            response_validator.required_field("invocation_error_output")
+            invocation_error_output = response.invocation_error_output
+    if _ENABLE_INJECT_TASK_CANCELLATIONS:
+        logger.warning("injecting cancellation failure for the task allocation")
+        if (
+            random.random() < 0.5
+        ):  # 50% chance to get stable reproduction in manual testing
+            outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+            failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
+    return TaskOutput(
         allocation=allocation,
-        outcome_code=(
-            TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
-            if response.success
-            else TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
-        ),
-        failure_reason=(
-            None
-            if response.success
-            else TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
-        ),
+        outcome_code=outcome_code,
+        failure_reason=failure_reason,
+        invocation_error_output=invocation_error_output,
+        function_outputs=response.function_outputs,
+        next_functions=response.next_functions,
         stdout=response.stdout,
         stderr=response.stderr,
         reducer=response.is_reducer,
         metrics=metrics,
     )
-    if response.HasField("function_output"):
-        output.function_output = response.function_output
-    if response.HasField("router_output"):
-        output.router_output = response.router_output
-    return output
 def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
     logger.info(
@@ -191,3 +216,40 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
             else None
         ),
     )
+def _to_task_outcome_code(
+    fe_task_outcome_code: FETaskOutcomeCode, logger
+) -> TaskOutcomeCode:
+    if fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+        return TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
+    elif fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
+        return TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+    else:
+        logger.warning(
+            "Unknown TaskOutcomeCode received from Function Executor",
+            value=FETaskOutcomeCode.Name(fe_task_outcome_code),
+        )
+        return TaskOutcomeCode.TASK_OUTCOME_CODE_UNKNOWN
+def _to_task_failure_reason(
+    fe_task_failure_reason: FETaskFailureReason, logger: Any
+) -> TaskFailureReason:
+    if fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR:
+        return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
+    elif (
+        fe_task_failure_reason
+        == FETaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
+    ):
+        return TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
+    elif (
+        fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
+    ):
+        return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
+    else:
+        logger.warning(
+            "Unknown TaskFailureReason received from Function Executor",
+            value=FETaskFailureReason.Name(fe_task_failure_reason),
+        )
+        return TaskFailureReason.TASK_FAILURE_REASON_UNKNOWN

indexify 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

indexify 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl