indexify 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. indexify/cli/cli.py +35 -6
  2. indexify/executor/api_objects.py +4 -0
  3. indexify/executor/downloader.py +45 -5
  4. indexify/executor/executor.py +103 -16
  5. indexify/executor/function_executor/function_executor.py +174 -55
  6. indexify/executor/function_executor/function_executor_state.py +6 -0
  7. indexify/executor/function_executor/function_executor_states_container.py +64 -0
  8. indexify/executor/function_executor/health_checker.py +20 -10
  9. indexify/executor/function_executor/invocation_state_client.py +31 -6
  10. indexify/executor/function_executor/metrics/function_executor.py +142 -0
  11. indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
  12. indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
  13. indexify/executor/function_executor/metrics/health_checker.py +14 -0
  14. indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
  15. indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
  16. indexify/executor/function_executor/single_task_runner.py +44 -15
  17. indexify/executor/function_executor/task_output.py +7 -1
  18. indexify/executor/metrics/downloader.py +69 -0
  19. indexify/executor/metrics/executor.py +51 -0
  20. indexify/executor/metrics/task_fetcher.py +21 -0
  21. indexify/executor/metrics/task_reporter.py +22 -0
  22. indexify/executor/metrics/task_runner.py +45 -0
  23. indexify/executor/monitoring/function_allowlist.py +25 -0
  24. indexify/executor/monitoring/handler.py +8 -0
  25. indexify/executor/monitoring/health_check_handler.py +20 -0
  26. indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
  27. indexify/executor/monitoring/health_checker/health_checker.py +23 -0
  28. indexify/executor/monitoring/metrics.py +245 -0
  29. indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
  30. indexify/executor/monitoring/server.py +41 -0
  31. indexify/executor/monitoring/startup_probe_handler.py +17 -0
  32. indexify/executor/task_fetcher.py +15 -1
  33. indexify/executor/task_reporter.py +24 -7
  34. indexify/executor/task_runner.py +64 -46
  35. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
  36. indexify-0.3.10.dist-info/RECORD +46 -0
  37. indexify-0.3.9.dist-info/RECORD +0 -25
  38. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
  39. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor/metrics/invocation_state_client.py
@@ -0,0 +1,45 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by InvocationStateClient.
+
+ # General metrics.
+ metric_request_read_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "function_executor_invocation_state_client_request_read_errors",
+     "Number of failed request reads in Function Executor Invocation State client resulting in its early termination",
+ )
+
+ # Get invocation state key-value Server API metrics.
+ metric_server_get_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+     "server_get_invocation_state_requests",
+     "Number of get invocation state requests sent to the Server on behalf of Function Executor",
+ )
+ metric_server_get_state_request_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_get_invocation_state_request_errors",
+         "Server get invocation state request errors",
+     )
+ )
+ metric_server_get_state_request_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_get_invocation_state_request", "Server get invocation state request"
+     )
+ )
+
+ # Set invocation state key-value Server API metrics.
+ metric_server_set_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+     "server_set_invocation_state_requests",
+     "Number of set invocation state requests sent to the Server on behalf of Function Executor",
+ )
+ metric_server_set_state_request_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_set_invocation_state_request_errors",
+         "Server set invocation state request errors",
+     )
+ )
+ metric_server_set_state_request_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_set_invocation_state_request", "Server set invocation state request"
+     )
+ )
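Note: the latency_metric_for_fast_operation and latency_metric_for_customer_controlled_operation helpers used throughout these new metric modules come from indexify/executor/monitoring/metrics.py (file 28 above, +245 lines, not shown in this excerpt). For rough orientation only, such a helper could be a small Histogram factory along these lines; the real implementation and its bucket boundaries may differ:

```python
import prometheus_client


# Hypothetical sketch of a latency histogram factory; not the actual code
# from indexify/executor/monitoring/metrics.py.
def latency_metric_for_fast_operation(
    operation_name: str, operation_description: str
) -> prometheus_client.Histogram:
    # Buckets assumed to target operations expected to finish within seconds.
    return prometheus_client.Histogram(
        f"{operation_name}_latency_seconds",
        f"Latency of {operation_description} in seconds",
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
    )
```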
indexify/executor/function_executor/metrics/single_task_runner.py
@@ -0,0 +1,22 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+ # This file contains all metrics used by SingleTaskRunner.
+
+ metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "function_executor_run_task_rpcs", "Number of Function Executor run task RPCs"
+     )
+ )
+ metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "function_executor_run_task_rpc_errors",
+         "Number of Function Executor run task RPC errors",
+     )
+ )
+ metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "function_executor_run_task_rpc", "Function Executor run task RPC"
+     )
+ )
indexify/executor/function_executor/single_task_runner.py
@@ -14,6 +14,11 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
  from ..api_objects import Task
  from .function_executor import CustomerError, FunctionExecutor
  from .function_executor_state import FunctionExecutorState
+ from .metrics.single_task_runner import (
+     metric_function_executor_run_task_rpc_errors,
+     metric_function_executor_run_task_rpc_latency,
+     metric_function_executor_run_task_rpcs,
+ )
  from .server.function_executor_server_factory import (
      FunctionExecutorServerConfiguration,
      FunctionExecutorServerFactory,
@@ -54,12 +59,11 @@ class SingleTaskRunner:
          if self._state.is_shutdown:
              raise RuntimeError("Function Executor state is shutting down.")

-         # If Function Executor is not healthy then recreate it.
-         if self._state.function_executor is not None:
-             if not await self._state.function_executor.health_checker().check():
-                 self._logger.error("Health check failed, destroying FunctionExecutor.")
-                 await self._state.destroy_function_executor()
+         # If Function Executor became unhealthy while it was idle then destroy it.
+         # It'll be recreated below.
+         await self._destroy_existing_function_executor_if_unhealthy()

+         # Create Function Executor if it doesn't exist yet.
          if self._state.function_executor is None:
              try:
                  await self._create_function_executor()
@@ -70,7 +74,12 @@
                      success=False,
                  )

-         return await self._run()
+         try:
+             return await self._run()
+         finally:
+             # If Function Executor became unhealthy while running the task then destroy it.
+             # The periodic health checker might not notice this because it only checks periodically.
+             await self._destroy_existing_function_executor_if_unhealthy()

      async def _create_function_executor(self) -> FunctionExecutor:
          function_executor: FunctionExecutor = FunctionExecutor(
@@ -122,19 +131,39 @@
              health_check_failed_callback=self._health_check_failed_callback,
              function_executor_state=self._state,
          ):
-             # If this RPC failed due to customer code crashing the server we won't be
-             # able to detect this. We'll treat this as our own error for now and thus
-             # let the AioRpcError to be raised here.
-             response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
-                 request
-             )
+             with (
+                 metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                 metric_function_executor_run_task_rpc_latency.time(),
+             ):
+                 metric_function_executor_run_task_rpcs.inc()
+                 # If this RPC failed due to customer code crashing the server we won't be
+                 # able to detect this. We'll treat this as our own error for now and thus
+                 # let the AioRpcError be raised here.
+                 response: RunTaskResponse = await FunctionExecutorStub(
+                     channel
+                 ).run_task(request)
              return _task_output(task=self._task_input.task, response=response)

      async def _health_check_failed_callback(self):
-         # The Function Executor needs to get recreated on next task run.
-         self._logger.error("Health check failed, destroying FunctionExecutor.")
+         # Destroying the Function Executor on a periodic health check failure ensures that
+         # a running task RPC stuck in an unhealthy Function Executor fails immediately.
          async with self._state.lock:
-             await self._state.destroy_function_executor()
+             if self._state.function_executor is not None:
+                 await self._destroy_function_executor_on_failed_health_check()
+
+     async def _destroy_existing_function_executor_if_unhealthy(self):
+         self._state.check_locked()
+         if self._state.function_executor is None:
+             return
+         if await self._state.function_executor.health_checker().check():
+             return
+         await self._destroy_function_executor_on_failed_health_check()
+
+     async def _destroy_function_executor_on_failed_health_check(self):
+         self._state.check_locked()
+         self._logger.error("Health check failed, destroying FunctionExecutor.")
+         self._state.health_check_failed = True
+         await self._state.destroy_function_executor()


  class _RunningTaskContextManager:
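The with block added to the run task RPC above combines two standard prometheus_client context managers: Counter.count_exceptions() increments the error counter when the body raises, and Histogram.time() observes the body's wall-clock duration. In isolation the pattern looks like this (illustrative names only, not the SingleTaskRunner code):

```python
import prometheus_client

rpcs = prometheus_client.Counter("example_rpcs", "Example RPC count")
rpc_errors = prometheus_client.Counter("example_rpc_errors", "Example RPC errors")
rpc_latency = prometheus_client.Histogram("example_rpc_latency_seconds", "Example RPC latency")


def call_rpc():
    # count_exceptions() increments rpc_errors if the body raises;
    # time() records how long the body took in rpc_latency.
    with rpc_errors.count_exceptions(), rpc_latency.time():
        rpcs.inc()
        ...  # the actual RPC call would go here
```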
indexify/executor/function_executor/task_output.py
@@ -20,6 +20,7 @@ class TaskOutput:
          stderr: Optional[str] = None,
          reducer: bool = False,
          success: bool = False,
+         is_internal_error: bool = False,
      ):
          self.task = task
          self.function_output = function_output
@@ -28,9 +29,14 @@
          self.stderr = stderr
          self.reducer = reducer
          self.success = success
+         self.is_internal_error = is_internal_error

      @classmethod
      def internal_error(cls, task: Task) -> "TaskOutput":
          """Creates a TaskOutput for an internal error."""
          # We are not sharing internal error messages with the customer.
-         return TaskOutput(task=task, stderr="Platform failed to execute the function.")
+         return TaskOutput(
+             task=task,
+             stderr="Platform failed to execute the function.",
+             is_internal_error=True,
+         )
indexify/executor/metrics/downloader.py
@@ -0,0 +1,69 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by Downloader.
+
+ # Graph download metrics
+ metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_downloads",
+     "Number of task graph downloads, including downloads served from local cache",
+ )
+ metric_graph_download_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_download_errors",
+     "Number of task download errors, including downloads served from local cache",
+ )
+ metric_graphs_from_cache: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_downloads_from_cache",
+     "Number of task graph downloads served from local cache",
+ )
+ metric_graph_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_graph_download",
+         "task graph download, including downloads served from local cache",
+     )
+ )
+ metric_tasks_downloading_graphs: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_downloading_graphs",
+     "Number of tasks currently downloading their graphs, including local cache lookups",
+ )
+
+ # Task input download metrics
+ metric_task_input_downloads: prometheus_client.Counter = prometheus_client.Counter(
+     "task_input_downloads", "Number of task input downloads"
+ )
+ metric_task_input_download_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_input_download_errors", "Number of task input download errors"
+     )
+ )
+ metric_task_input_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_input_download", "task input download")
+ )
+ metric_tasks_downloading_inputs: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_downloading_inputs", "Number of tasks currently downloading their inputs"
+ )
+
+ # Reducer init value download metrics
+ metric_reducer_init_value_downloads: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_reducer_init_value_downloads", "Number of reducer init value downloads"
+     )
+ )
+ metric_reducer_init_value_download_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_reducer_init_value_download_errors",
+         "Number of reducer init value download errors",
+     )
+ )
+ metric_reducer_init_value_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_reducer_init_value_download", "Task reducer init value download"
+     )
+ )
+ metric_tasks_downloading_reducer_init_value: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "tasks_downloading_reducer_init_value",
+         "Number of tasks currently downloading their reducer init values",
+     )
+ )
indexify/executor/metrics/executor.py
@@ -0,0 +1,51 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by Executor.
+
+ # Executor overview metrics.
+ metric_executor_info: prometheus_client.Info = prometheus_client.Info(
+     "executor", "Executor information"
+ )
+ metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
+     "executor_state",
+     "Current Executor state",
+     states=["starting", "running", "shutting_down"],
+ )
+
+ # Task statistics metrics.
+ metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_fetched", "Number of tasks that were fetched from Server"
+ )
+ metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_completed", "Number of tasks that were completed", ["outcome"]
+ )
+ METRIC_TASKS_COMPLETED_OUTCOME_ALL = "all"
+ METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS = "success"
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE = "error_customer_code"
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM = "error_platform"
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL)
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS)
+ metric_tasks_completed.labels(
+     outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+ )
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM)
+
+ # Task outcome reporting metrics.
+ metric_task_outcome_reports: prometheus_client.Counter = prometheus_client.Counter(
+     "task_outcome_reports",
+     "Number of task outcome reports",
+ )
+ metric_tasks_reporting_outcome: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_reporting_outcome",
+     "Number of tasks currently reporting their outcomes",
+ )
+ metric_task_outcome_report_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_outcome_report", "task outcome report")
+ )
+ metric_task_outcome_report_retries: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "tasks_outcome_report_retries", "Number of task outcome report retries"
+     )
+ )
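The labels(...) calls above pre-create every outcome series so each one is exported with a value of 0 from Executor startup. When a task finishes, the reporting code can then increment both the aggregate and the specific outcome, roughly like this (illustrative usage, not the exact code from executor.py):

```python
from indexify.executor.metrics.executor import (
    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
    metric_tasks_completed,
)

# Record one successfully completed task.
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS).inc()
```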
indexify/executor/metrics/task_fetcher.py
@@ -0,0 +1,21 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by TaskFetcher.
+
+ metric_server_registrations: prometheus_client.Counter = prometheus_client.Counter(
+     "server_registration_requests",
+     "Number of Executor registration requests sent to the Server",
+ )
+ metric_server_registration_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_registration_request_errors",
+         "Number of failed Executor registration requests",
+     )
+ )
+ metric_server_registration_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_registration_request", "Register Executor at the Server"
+     )
+ )
indexify/executor/metrics/task_reporter.py
@@ -0,0 +1,22 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by TaskReporter.
+
+ metric_server_ingest_files_requests: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_ingest_files_requests", "Number of Server ingest files requests"
+     )
+ )
+ metric_server_ingest_files_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_ingest_files_request_errors",
+         "Number of Server ingest files request errors",
+     )
+ )
+ metric_server_ingest_files_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_ingest_files_request", "Ingest files request to Server"
+     )
+ )
indexify/executor/metrics/task_runner.py
@@ -0,0 +1,45 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+ # This file contains all metrics used by TaskRunner.
+
+ # Metrics for the stage when task is blocked by the current policy.
+ metric_task_policy_runs: prometheus_client.Counter = prometheus_client.Counter(
+     "task_policy_runs",
+     "Number of task execution policy runs",
+ )
+ metric_task_policy_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_policy_errors",
+     "Number of errors while running task execution policy",
+ )
+ metric_task_policy_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "task_policy",
+         "Task execution blocked by the policy",
+     )
+ )
+ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_blocked_by_policy",
+     "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
+ )
+
+ # Metrics for the stage when task is running.
+ metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
+     "task_runs",
+     "Number of task runs",
+ )
+ metric_task_run_platform_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_run_platform_errors",
+     "Number of platform errors while running task",
+ )
+ metric_task_run_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "task_run",
+         "run task from the moment it is unblocked by the policy until it finishes",
+     )
+ )
+ metric_tasks_running: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_running",
+     "Number of running tasks",
+ )
indexify/executor/monitoring/function_allowlist.py
@@ -0,0 +1,25 @@
+ from typing import Dict, List, Optional
+
+ from ..api_objects import FunctionURI
+
+
+ def function_allowlist_to_info_dict(
+     function_allowlist: Optional[List[FunctionURI]],
+ ) -> Dict[str, str]:
+     if function_allowlist is None:
+         return {"function_allowlist": "None"}
+
+     info = {}
+     counter = 0
+     for function_uri in function_allowlist:
+         function_uri: FunctionURI
+         info[f"function_allowlist_{counter}"] = ":".join(
+             [
+                 function_uri.namespace,
+                 function_uri.compute_graph,
+                 function_uri.compute_fn,
+                 str(function_uri.version),
+             ]
+         )
+         counter += 1
+     return info
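For illustration, the helper flattens each allowlist entry into one "namespace:graph:function:version" string keyed by its position. The snippet below uses a hypothetical stand-in object with the same four attributes, since constructing the real FunctionURI model from api_objects.py is not shown in this diff:

```python
from dataclasses import dataclass

from indexify.executor.monitoring.function_allowlist import function_allowlist_to_info_dict


# Hypothetical stand-in exposing the four attributes the helper reads;
# the real FunctionURI model is defined in indexify/executor/api_objects.py.
@dataclass
class FakeFunctionURI:
    namespace: str = "default"
    compute_graph: str = "my_graph"
    compute_fn: str = "my_fn"
    version: str = "1"


print(function_allowlist_to_info_dict([FakeFunctionURI()]))
# {'function_allowlist_0': 'default:my_graph:my_fn:1'}
print(function_allowlist_to_info_dict(None))
# {'function_allowlist': 'None'}
```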
indexify/executor/monitoring/handler.py
@@ -0,0 +1,8 @@
+ from aiohttp import web
+
+
+ class Handler:
+     """Abstract base class for all request handlers."""
+
+     async def handle(self, request: web.Request) -> web.Response:
+         raise NotImplementedError("Subclasses must implement this method.")
indexify/executor/monitoring/health_check_handler.py
@@ -0,0 +1,20 @@
+ from aiohttp import web
+
+ from .handler import Handler
+ from .health_checker.health_checker import HealthChecker, HealthCheckResult
+
+
+ class HealthCheckHandler(Handler):
+     def __init__(self, health_checker: HealthChecker):
+         self._health_checker = health_checker
+
+     async def handle(self, request: web.Request) -> web.Response:
+         result: HealthCheckResult = await self._health_checker.check()
+         return web.json_response(
+             {
+                 "status": "ok" if result.is_success else "nok",
+                 "message": result.status_message,
+                 "checker": result.checker_name,
+             },
+             status=200 if result.is_success else 503,
+         )
indexify/executor/monitoring/health_checker/generic_health_checker.py
@@ -0,0 +1,58 @@
+ from typing import Optional
+
+ from ...function_executor.function_executor_states_container import (
+     FunctionExecutorStatesContainer,
+ )
+ from .health_checker import HealthChecker, HealthCheckResult
+
+ HEALTH_CHECKER_NAME = "GenericHealthChecker"
+
+
+ class GenericHealthChecker(HealthChecker):
+     """A generic health checker that doesn't depend on machine type or other features of the environment.
+
+     The health checker uses software signals available in all environments, like Function Executor failure rates.
+     """
+
+     def __init__(self):
+         self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
+
+     def set_function_executor_states_container(
+         self, states: FunctionExecutorStatesContainer
+     ):
+         self._function_executor_states = states
+
+     async def check(self) -> HealthCheckResult:
+         if self._function_executor_states is None:
+             return HealthCheckResult(
+                 is_success=False,
+                 status_message="Function Executor states container was not provided yet",
+                 checker_name=HEALTH_CHECKER_NAME,
+             )
+
+         # Current health check policy and reasoning:
+         # * A Function Executor health check failure is a strong signal that something is wrong
+         #   with either:
+         #   - The Function Code (a critical software bug).
+         #   - The Executor machine/container/VM (a software bug or malfunctioning local hardware).
+         # * Critical Function Code bugs tend to get fixed eventually by users. What doesn't get fixed eventually
+         #   are rare but recurring local Executor issues like hardware errors and software bugs in middleware like
+         #   drivers.
+         # * Such issues tend to get mitigated by automatically recreating the Executor machine/VM/container.
+         # * So we fail the whole Executor health check if a Function Executor health check ever failed, to hint to users
+         #   that the Executor machine/VM/container probably needs to be recreated (unless there's a bug in Function
+         #   code that the user can investigate themselves).
+         async for state in self._function_executor_states:
+             # No need to async lock the state to read a single value.
+             if state.health_check_failed:
+                 return HealthCheckResult(
+                     is_success=False,
+                     status_message="A Function Executor health check failed",
+                     checker_name=HEALTH_CHECKER_NAME,
+                 )
+
+         return HealthCheckResult(
+             is_success=True,
+             status_message="All Function Executors pass health checks",
+             checker_name=HEALTH_CHECKER_NAME,
+         )
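HealthCheckHandler and GenericHealthChecker are wired together by the aiohttp-based monitoring server added in indexify/executor/monitoring/server.py (file 30 above, not shown in this excerpt). A minimal sketch of how they could be mounted; the route path and port here are assumptions, not necessarily what the Executor actually uses:

```python
from aiohttp import web

from indexify.executor.monitoring.health_check_handler import HealthCheckHandler
from indexify.executor.monitoring.health_checker.generic_health_checker import (
    GenericHealthChecker,
)

app = web.Application()
handler = HealthCheckHandler(GenericHealthChecker())
# "/monitoring/health" is a hypothetical path used only for this sketch.
app.add_routes([web.get("/monitoring/health", handler.handle)])

if __name__ == "__main__":
    web.run_app(app, port=7000)  # port chosen arbitrarily for this sketch
```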
indexify/executor/monitoring/health_checker/health_checker.py
@@ -0,0 +1,23 @@
+ from ...function_executor.function_executor_states_container import (
+     FunctionExecutorStatesContainer,
+ )
+
+
+ class HealthCheckResult:
+     def __init__(self, checker_name: str, is_success: bool, status_message: str):
+         self.checker_name = checker_name
+         self.is_success = is_success
+         self.status_message = status_message
+
+
+ class HealthChecker:
+     """Abstract base class for health checkers."""
+
+     def set_function_executor_states_container(
+         self, states: FunctionExecutorStatesContainer
+     ):
+         """Provides function executor states to this health checker so it can use them in the health checks."""
+         raise NotImplementedError("Subclasses must implement this method.")
+
+     async def check(self) -> HealthCheckResult:
+         raise NotImplementedError("Subclasses must implement this method.")