indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,364 @@
|
|
1
|
+
import asyncio
|
2
|
+
import hashlib
|
3
|
+
import platform
|
4
|
+
import sys
|
5
|
+
from socket import gethostname
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
|
8
|
+
from indexify.proto.executor_api_pb2 import (
|
9
|
+
AllowedFunction,
|
10
|
+
ExecutorState,
|
11
|
+
ExecutorStatus,
|
12
|
+
FunctionExecutorState,
|
13
|
+
GPUModel,
|
14
|
+
GPUResources,
|
15
|
+
)
|
16
|
+
from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
|
17
|
+
from indexify.proto.executor_api_pb2 import (
|
18
|
+
ReportExecutorStateRequest,
|
19
|
+
TaskFailureReason,
|
20
|
+
TaskOutcomeCode,
|
21
|
+
TaskResult,
|
22
|
+
)
|
23
|
+
from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
|
24
|
+
|
25
|
+
from .channel_manager import ChannelManager
|
26
|
+
from .function_allowlist import FunctionURI
|
27
|
+
from .function_executor_controller.loggers import task_logger
|
28
|
+
from .function_executor_controller.task_output import TaskOutput
|
29
|
+
from .host_resources.host_resources import HostResources, HostResourcesProvider
|
30
|
+
from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
|
31
|
+
from .metrics.state_reporter import (
|
32
|
+
metric_state_report_errors,
|
33
|
+
metric_state_report_latency,
|
34
|
+
metric_state_report_rpcs,
|
35
|
+
)
|
36
|
+
|
37
|
+
_REPORTING_INTERVAL_SEC = 5
|
38
|
+
_REPORT_RPC_TIMEOUT_SEC = 5
|
39
|
+
|
40
|
+
|
41
|
+
class ExecutorStateReporter:
    """Periodically reports Executor state and completed task results to the Server.

    Two background asyncio tasks cooperate: a scheduler that marks a report as
    due every ``reporting_interval_sec`` seconds, and a worker that sends a
    report whenever one is scheduled. Callers can also force an immediate
    report via ``schedule_state_report()`` or wait for one with
    ``report_state_and_wait_for_completion()``.
    """

    def __init__(
        self,
        executor_id: str,
        version: str,
        labels: Dict[str, str],
        function_allowlist: List[FunctionURI],
        channel_manager: ChannelManager,
        host_resources_provider: HostResourcesProvider,
        logger: Any,
        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
    ):
        self._executor_id: str = executor_id
        self._version: str = version
        self._labels: Dict[str, str] = labels.copy()
        self._labels.update(_executor_labels())
        self._hostname: str = gethostname()
        self._channel_manager = channel_manager
        self._logger: Any = logger.bind(module=__name__)
        self._reporting_interval_sec: int = reporting_interval_sec
        self._allowed_functions: List[AllowedFunction] = _to_allowed_function_protos(
            function_allowlist
        )
        # We need to fetch total resources only once, because they are not changing.
        self._total_host_resources: HostResources = (
            host_resources_provider.total_host_resources(self._logger)
        )
        self._total_function_executor_resources: HostResources = (
            host_resources_provider.total_function_executor_resources(self._logger)
        )
        self._logger.info(
            "detected host resources",
            total_host_resources=self._total_host_resources,
            total_function_executor_resources=self._total_function_executor_resources,
        )
        self._state_report_worker: Optional[asyncio.Task] = None
        self._periodic_state_report_scheduler: Optional[asyncio.Task] = None

        # Mutable fields
        self._state_report_scheduled_event: asyncio.Event = asyncio.Event()
        self._state_reported_event: asyncio.Event = asyncio.Event()
        self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
        self._last_server_clock: int = (
            0  # Server expects initial value to be 0 until it is set by Server.
        )
        self._completed_task_outputs: List[TaskOutput] = []
        self._function_executor_states: Dict[str, FunctionExecutorState] = {}

    def update_executor_status(self, value: ExecutorStatus) -> None:
        self._executor_status = value

    def update_last_server_clock(self, value: int) -> None:
        self._last_server_clock = value

    def update_function_executor_state(
        self,
        state: FunctionExecutorState,
    ) -> None:
        self._function_executor_states[state.description.id] = state

    def remove_function_executor_info(self, function_executor_id: str) -> None:
        if function_executor_id not in self._function_executor_states:
            self._logger.warning(
                "attempted to remove non-existing function executor state",
                function_executor_id=function_executor_id,
            )
            return

        self._function_executor_states.pop(function_executor_id)

    def add_completed_task_output(self, task_output: TaskOutput) -> None:
        self._completed_task_outputs.append(task_output)

    def schedule_state_report(self) -> None:
        """Schedules a state report to be sent to the server asap.

        This method is called when the executor state changes and it needs to get reported.
        The call doesn't block and returns immediately.
        """
        self._state_report_scheduled_event.set()

    async def report_state_and_wait_for_completion(self) -> None:
        """Schedules a state report to be sent to the server asap and waits for the completion of the reporting."""
        self._state_reported_event.clear()
        self.schedule_state_report()
        await self._state_reported_event.wait()

    def run(self) -> None:
        """Runs the state reporter.

        This method is called when the executor starts and it needs to start reporting its state
        periodically.
        """
        self._state_report_worker = asyncio.create_task(
            self._state_report_worker_loop(), name="state_reporter_worker"
        )
        self._periodic_state_report_scheduler = asyncio.create_task(
            self._periodic_state_report_scheduler_loop(),
            name="state_reporter_periodic_scheduler",
        )

    async def shutdown(self) -> None:
        """Tries to do one last state report and shuts down the state reporter.

        Doesn't raise any exceptions."""
        if self._state_report_worker is not None:
            self._state_report_worker.cancel()
            try:
                await self._state_report_worker
            except asyncio.CancelledError:
                pass  # Expected exception
            self._state_report_worker = None

        if self._periodic_state_report_scheduler is not None:
            self._periodic_state_report_scheduler.cancel()
            try:
                await self._periodic_state_report_scheduler
            except asyncio.CancelledError:
                pass
            self._periodic_state_report_scheduler = None

        # Don't retry state report if it failed during shutdown.
        # We only do best effort last state report and Server might not be available.
        try:
            async with self._channel_manager.create_channel() as channel:
                stub = ExecutorAPIStub(channel)
                await self._report_state(stub)
        except BaseException as e:
            self._logger.error(
                "failed to report state during shutdown",
                exc_info=e,
            )

    async def _periodic_state_report_scheduler_loop(self) -> None:
        while True:
            self._state_report_scheduled_event.set()
            await asyncio.sleep(self._reporting_interval_sec)

    async def _state_report_worker_loop(self) -> None:
        """Runs the state reporter.

        Never raises any exceptions.
        """
        while True:
            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
            while True:
                await self._state_report_scheduled_event.wait()
                # Clear the event immediately to report again asap if needed. This reduces latency in the system.
                self._state_report_scheduled_event.clear()
                try:
                    # The periodic state reports serve as channel health monitoring requests
                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
                    # from this channel health monitoring.
                    await self._report_state(stub)
                    self._state_reported_event.set()
                except Exception as e:
                    self._logger.error(
                        f"failed to report state to the server, retrying in {self._reporting_interval_sec} sec.",
                        exc_info=e,
                    )
                    break  # exit the inner loop to recreate the channel if needed

    async def _report_state(self, stub: ExecutorAPIStub):
        """Reports the current state to the server represented by the supplied stub.

        Raises an exception on failure.
        """
        with (
            metric_state_report_errors.count_exceptions(),
            metric_state_report_latency.time(),
        ):
            metric_state_report_rpcs.inc()
            state: ExecutorState = self._current_executor_state()
            task_outputs: List[TaskOutput] = self._remove_completed_tasks()
            task_results: List[TaskResult] = _to_task_result_protos(task_outputs)

            try:
                await stub.report_executor_state(
                    ReportExecutorStateRequest(
                        executor_state=state, task_results=task_results
                    ),
                    timeout=_REPORT_RPC_TIMEOUT_SEC,
                )
            except Exception:
                # Put the outputs back at the front of the pending list so the next
                # report retries them and completion order is preserved relative to
                # tasks that completed while this RPC was in flight.
                self._completed_task_outputs[:0] = task_outputs
                raise

    def _current_executor_state(self) -> ExecutorState:
        """Returns the current executor state."""
        state = ExecutorState(
            executor_id=self._executor_id,
            hostname=self._hostname,
            version=self._version,
            status=self._executor_status,
            total_function_executor_resources=_to_host_resources_proto(
                self._total_function_executor_resources
            ),
            total_resources=_to_host_resources_proto(self._total_host_resources),
            allowed_functions=self._allowed_functions,
            function_executor_states=list(self._function_executor_states.values()),
            labels=self._labels,
        )
        state.state_hash = _state_hash(state)
        # Set fields not included in the state hash.
        state.server_clock = self._last_server_clock
        return state

    def _remove_completed_tasks(self) -> List[TaskOutput]:
        """Drains the pending completed task outputs, preserving completion order.

        NB: the previous implementation popped from the end of the list, which
        reversed the order in which task outcomes were reported to the Server.
        """
        task_outputs: List[TaskOutput] = self._completed_task_outputs
        self._completed_task_outputs = []
        for task_output in task_outputs:
            task_logger(task_output.task, self._logger).info(
                "reporting task outcome",
                outcome_code=TaskOutcomeCode.Name(task_output.outcome_code),
                failure_reason=(
                    "None"
                    if task_output.failure_reason is None
                    else TaskFailureReason.Name(task_output.failure_reason)
                ),
            )
        return task_outputs
|
265
|
+
|
266
|
+
|
267
|
+
def _to_allowed_function_protos(
    function_allowlist: List[FunctionURI],
) -> List[AllowedFunction]:
    """Converts the function allowlist URIs into AllowedFunction protos.

    ``graph_version`` is only set on a proto when the URI pins a version.
    """
    protos: List[AllowedFunction] = []
    for uri in function_allowlist:
        proto = AllowedFunction(
            namespace=uri.namespace,
            graph_name=uri.compute_graph,
            function_name=uri.compute_fn,
        )
        # An unset version means "any graph version is allowed".
        if uri.version is not None:
            proto.graph_version = uri.version
        protos.append(proto)

    return protos
|
283
|
+
|
284
|
+
|
285
|
+
def _state_hash(state: ExecutorState) -> str:
    """Returns a hex SHA-256 digest of the deterministically serialized state.

    Deterministic serialization keeps the hash stable for equal states so the
    Server can cheaply detect whether the reported state changed.
    """
    serialized: bytes = state.SerializeToString(deterministic=True)
    # Not used for security purposes, only for change detection.
    return hashlib.sha256(serialized, usedforsecurity=False).hexdigest()
|
290
|
+
|
291
|
+
|
292
|
+
def _to_host_resources_proto(host_resources: HostResources) -> HostResourcesProto:
    """Converts a HostResources object into its HostResourcesProto message.

    Memory and disk are converted from MB to bytes; the GPU sub-message is only
    populated when at least one GPU is present.
    """
    mb = 1024 * 1024
    proto = HostResourcesProto(
        cpu_count=host_resources.cpu_count,
        memory_bytes=host_resources.memory_mb * mb,
        disk_bytes=host_resources.disk_mb * mb,
    )
    gpus = host_resources.gpus
    if len(gpus) > 0:
        # All GPUs have the same model, so the first one is representative.
        gpu_resources = GPUResources(
            count=len(gpus),
            model=_to_gpu_model_proto(gpus[0].model),
        )
        proto.gpu.CopyFrom(gpu_resources)
    return proto
|
308
|
+
|
309
|
+
|
310
|
+
def _to_gpu_model_proto(nvidia_gpu_model: NVIDIA_GPU_MODEL) -> GPUModel:
    """Maps an NVIDIA_GPU_MODEL value to the corresponding GPUModel proto enum.

    Unrecognized models map to GPU_MODEL_UNKNOWN.
    """
    model_map = {
        NVIDIA_GPU_MODEL.A100_40GB: GPUModel.GPU_MODEL_NVIDIA_A100_40GB,
        NVIDIA_GPU_MODEL.A100_80GB: GPUModel.GPU_MODEL_NVIDIA_A100_80GB,
        NVIDIA_GPU_MODEL.H100_80GB: GPUModel.GPU_MODEL_NVIDIA_H100_80GB,
        NVIDIA_GPU_MODEL.TESLA_T4: GPUModel.GPU_MODEL_NVIDIA_TESLA_T4,
        NVIDIA_GPU_MODEL.A6000: GPUModel.GPU_MODEL_NVIDIA_A6000,
        NVIDIA_GPU_MODEL.A10: GPUModel.GPU_MODEL_NVIDIA_A10,
    }
    return model_map.get(nvidia_gpu_model, GPUModel.GPU_MODEL_UNKNOWN)
|
325
|
+
|
326
|
+
|
327
|
+
def _to_task_result_protos(task_outputs: List[TaskOutput]) -> List[TaskResult]:
    """Converts completed task outputs into TaskResult protos for reporting.

    Optional fields (failure_reason, stdout, stderr, routing) are only set
    when the corresponding output attribute is present.
    """
    results: List[TaskResult] = []

    for task_output in task_outputs:
        task = task_output.task
        router_output = task_output.router_output
        result = TaskResult(
            task_id=task.id,
            allocation_id=task_output.allocation_id,
            namespace=task.namespace,
            graph_name=task.graph_name,
            function_name=task.function_name,
            graph_invocation_id=task.graph_invocation_id,
            reducer=task_output.reducer,
            outcome_code=task_output.outcome_code,
            next_functions=(router_output.edges if router_output else []),
            function_outputs=task_output.uploaded_data_payloads,
        )
        if task_output.failure_reason is not None:
            result.failure_reason = task_output.failure_reason
        if task_output.uploaded_stdout is not None:
            result.stdout.CopyFrom(task_output.uploaded_stdout)
        if task_output.uploaded_stderr is not None:
            result.stderr.CopyFrom(task_output.uploaded_stderr)
        if router_output is not None:
            # Setting the repeated field also marks the `routing` sub-message present.
            result.routing.next_functions[:] = router_output.edges

        results.append(result)

    return results
|
355
|
+
|
356
|
+
|
357
|
+
def _executor_labels() -> Dict[str, str]:
    """Returns standard executor labels always added to user supplied labels."""
    version_info = sys.version_info
    labels: Dict[str, str] = {
        "os": platform.system(),
        "architecture": platform.machine(),
        "python_major_version": str(version_info.major),
        "python_minor_version": str(version_info.minor),
    }
    return labels
|
@@ -15,7 +15,6 @@ enum DataPayloadEncoding {
|
|
15
15
|
}
|
16
16
|
|
17
17
|
message DataPayload {
|
18
|
-
optional string path = 1; // deprecated, TODO: remove when URI is used everywhere
|
19
18
|
optional uint64 size = 2;
|
20
19
|
optional string sha256_hash = 3;
|
21
20
|
// URI of the data.
|
@@ -41,7 +40,6 @@ enum GPUModel {
|
|
41
40
|
message GPUResources {
|
42
41
|
optional uint32 count = 1;
|
43
42
|
optional GPUModel model = 2;
|
44
|
-
reserved 3;
|
45
43
|
}
|
46
44
|
|
47
45
|
// Resources that we're currently tracking and limiting on Executor.
|
@@ -64,17 +62,29 @@ message AllowedFunction {
|
|
64
62
|
|
65
63
|
enum FunctionExecutorStatus {
|
66
64
|
FUNCTION_EXECUTOR_STATUS_UNKNOWN = 0;
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
65
|
+
// Function Executor is being created.
|
66
|
+
FUNCTION_EXECUTOR_STATUS_PENDING = 1;
|
67
|
+
// Function Executor is running and ready to accept tasks.
|
68
|
+
FUNCTION_EXECUTOR_STATUS_RUNNING = 2;
|
69
|
+
// Function Executor is terminated, all resources are freed.
|
70
|
+
FUNCTION_EXECUTOR_STATUS_TERMINATED = 3;
|
71
|
+
}
|
72
|
+
|
73
|
+
enum FunctionExecutorTerminationReason {
|
74
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_UNKNOWN = 0;
|
75
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR = 1;
|
76
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR = 2;
|
77
|
+
// Timeout on FE startup while running the function constructor.
|
78
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT = 3;
|
79
|
+
|
80
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN = 10;
|
81
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE = 11;
|
82
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY = 12;
|
83
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR = 13;
|
84
|
+
// Timeout while running the function.
|
85
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT = 14;
|
86
|
+
// The running function allocation was removed from the desired state.
|
87
|
+
FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED = 15;
|
78
88
|
}
|
79
89
|
|
80
90
|
// Immutable information that identifies and describes a Function Executor.
|
@@ -95,7 +105,6 @@ message FunctionExecutorDescription {
|
|
95
105
|
optional string function_name = 5;
|
96
106
|
optional string image_uri = 6;
|
97
107
|
repeated string secret_names = 7;
|
98
|
-
optional HostResources resource_limits = 8;
|
99
108
|
// Timeout for customer code duration during FE creation.
|
100
109
|
optional uint32 customer_code_timeout_ms = 9;
|
101
110
|
optional DataPayload graph = 10;
|
@@ -105,7 +114,7 @@ message FunctionExecutorDescription {
|
|
105
114
|
message FunctionExecutorState {
|
106
115
|
optional FunctionExecutorDescription description = 1;
|
107
116
|
optional FunctionExecutorStatus status = 2;
|
108
|
-
|
117
|
+
optional FunctionExecutorTerminationReason termination_reason = 3;
|
109
118
|
}
|
110
119
|
|
111
120
|
enum ExecutorStatus {
|
@@ -113,21 +122,12 @@ enum ExecutorStatus {
|
|
113
122
|
EXECUTOR_STATUS_STARTING_UP = 1;
|
114
123
|
EXECUTOR_STATUS_RUNNING = 2;
|
115
124
|
EXECUTOR_STATUS_DRAINED = 3;
|
116
|
-
|
117
|
-
EXECUTOR_STATUS_STOPPED = 5;
|
118
|
-
}
|
119
|
-
|
120
|
-
enum ExecutorFlavor {
|
121
|
-
EXECUTOR_FLAVOR_UNKNOWN = 0;
|
122
|
-
EXECUTOR_FLAVOR_OSS = 1;
|
123
|
-
EXECUTOR_FLAVOR_PLATFORM = 2;
|
125
|
+
EXECUTOR_STATUS_STOPPED = 4;
|
124
126
|
}
|
125
127
|
|
126
128
|
message ExecutorState {
|
127
129
|
optional string executor_id = 1;
|
128
|
-
optional bool development_mode = 2;
|
129
130
|
optional string hostname = 3;
|
130
|
-
optional ExecutorFlavor flavor = 4;
|
131
131
|
optional string version = 5;
|
132
132
|
optional ExecutorStatus status = 6;
|
133
133
|
// Total resources at the Executor.
|
@@ -148,6 +148,7 @@ message ExecutorState {
|
|
148
148
|
// A message sent by Executor to report its up to date state to Server.
|
149
149
|
message ReportExecutorStateRequest {
|
150
150
|
optional ExecutorState executor_state = 1;
|
151
|
+
repeated TaskResult task_results = 2;
|
151
152
|
}
|
152
153
|
|
153
154
|
// A message sent by Server to Executor to acknowledge the receipt of Executor state.
|
@@ -170,8 +171,6 @@ message Task {
|
|
170
171
|
optional string graph_version = 4;
|
171
172
|
optional string function_name = 5;
|
172
173
|
optional string graph_invocation_id = 6;
|
173
|
-
optional string input_key = 8; // deprecated. TODO: remove when input is used everywhere
|
174
|
-
optional string reducer_output_key = 9; // deprecated. TODO: remove when reducer_input is used everywhere
|
175
174
|
optional uint32 timeout_ms = 10;
|
176
175
|
optional DataPayload input = 11;
|
177
176
|
optional DataPayload reducer_input = 12;
|
@@ -185,6 +184,7 @@ message Task {
|
|
185
184
|
message TaskAllocation {
|
186
185
|
optional string function_executor_id = 1;
|
187
186
|
optional Task task = 2;
|
187
|
+
optional string allocation_id = 3;
|
188
188
|
}
|
189
189
|
|
190
190
|
// A message sent by Executor to Server to open the stream of desired Executor States for the Executor.
|
@@ -203,46 +203,57 @@ message DesiredExecutorState {
|
|
203
203
|
}
|
204
204
|
|
205
205
|
// ===== report_task_outcome RPC =====
|
206
|
-
enum
|
207
|
-
|
208
|
-
|
209
|
-
|
206
|
+
enum TaskOutcomeCode {
|
207
|
+
TASK_OUTCOME_CODE_UNKNOWN = 0;
|
208
|
+
TASK_OUTCOME_CODE_SUCCESS = 1;
|
209
|
+
TASK_OUTCOME_CODE_FAILURE = 2;
|
210
|
+
}
|
211
|
+
|
212
|
+
enum TaskFailureReason {
|
213
|
+
TASK_FAILURE_REASON_UNKNOWN = 0;
|
214
|
+
TASK_FAILURE_REASON_INTERNAL_ERROR = 1;
|
215
|
+
TASK_FAILURE_REASON_FUNCTION_ERROR = 2;
|
216
|
+
TASK_FAILURE_REASON_FUNCTION_TIMEOUT = 3;
|
217
|
+
TASK_FAILURE_REASON_TASK_CANCELLED = 4;
|
218
|
+
TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = 5;
|
210
219
|
}
|
211
220
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
OUTPUT_ENCODING_BINARY = 3;
|
221
|
+
message ResultRouting {
|
222
|
+
// The list of next functions.
|
223
|
+
// NB: An empty list indicates that no routing should be performed.
|
224
|
+
repeated string next_functions = 1;
|
217
225
|
}
|
218
226
|
|
219
|
-
message
|
227
|
+
message TaskResult {
|
220
228
|
optional string task_id = 1;
|
221
229
|
optional string namespace = 2;
|
222
230
|
optional string graph_name = 3;
|
223
231
|
optional string function_name = 4;
|
224
|
-
optional string graph_invocation_id =
|
225
|
-
optional
|
226
|
-
optional
|
227
|
-
optional
|
228
|
-
optional bool reducer = 10;
|
229
|
-
|
232
|
+
optional string graph_invocation_id = 5;
|
233
|
+
optional bool reducer = 6;
|
234
|
+
optional TaskOutcomeCode outcome_code = 7;
|
235
|
+
optional TaskFailureReason failure_reason = 8;
|
230
236
|
// Edges that the function wants the invocation to be routed to.
|
231
237
|
// Previously called router_edges.
|
232
|
-
|
233
|
-
//
|
234
|
-
repeated
|
238
|
+
// NB: An empty list indicates that the graph's route definitions should be used,
|
239
|
+
// unless this field is overridden by the presence of the `routing` field.
|
240
|
+
repeated string next_functions = 9;
|
241
|
+
repeated DataPayload function_outputs = 10;
|
235
242
|
// Standard output and error streams of the function.
|
236
|
-
optional DataPayload stdout =
|
237
|
-
optional DataPayload stderr =
|
238
|
-
|
239
|
-
optional
|
240
|
-
// This allows us to change how we encode the output from functions
|
241
|
-
// and serialize them into storage.
|
242
|
-
optional uint64 output_encoding_version = 5; // deprecated. TODO: remove when DataPayload.encoding_version is used everywhere
|
243
|
-
}
|
243
|
+
optional DataPayload stdout = 11;
|
244
|
+
optional DataPayload stderr = 12;
|
245
|
+
|
246
|
+
optional string allocation_id = 13;
|
244
247
|
|
245
|
-
|
248
|
+
// Indicates how the results should be routed.
|
249
|
+
// If this is present, it replaces `next_functions`.
|
250
|
+
//
|
251
|
+
// If absent, `next_functions` will be used; note that if no
|
252
|
+
// routes are defined in `next_functions`, this will use the
|
253
|
+
// graph's routing. The long-term goal is to deprecate
|
254
|
+
// `next_functions`, so that if `routing` is not present, the
|
255
|
+
// graph's routing definitions will always be used.
|
256
|
+
ResultRouting routing = 14;
|
246
257
|
}
|
247
258
|
|
248
259
|
// Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
|
@@ -262,7 +273,4 @@ service ExecutorAPI {
|
|
262
273
|
//
|
263
274
|
// Deprecated HTTP API is used to download the serialized graph and task inputs.
|
264
275
|
rpc get_desired_executor_states(GetDesiredExecutorStatesRequest) returns (stream DesiredExecutorState) {}
|
265
|
-
|
266
|
-
// Report the outcome of a task.
|
267
|
-
rpc report_task_outcome(ReportTaskOutcomeRequest) returns (ReportTaskOutcomeResponse) {}
|
268
|
-
}
|
276
|
+
}
|