indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/executor.py
CHANGED
@@ -1,54 +1,35 @@
 import asyncio
 import signal
-import time
 from pathlib import Path
 from socket import gethostname
-from typing import
+from typing import Dict, List, Optional
 
 import structlog
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-from tensorlake.utils.logging import suppress as suppress_logging
 
 from indexify.proto.executor_api_pb2 import ExecutorStatus
 
-from .api_objects import FunctionURI, Task
 from .blob_store.blob_store import BLOBStore
-from .
-from .
-
-
+from .channel_manager import ChannelManager
+from .function_allowlist import (
+    FunctionURI,
+    function_allowlist_to_indexed_dict,
+    parse_function_uris,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_manager import ChannelManager
-from .grpc.state_reconciler import ExecutorStateReconciler
-from .grpc.state_reporter import ExecutorStateReporter
 from .host_resources.host_resources import HostResourcesProvider
 from .metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
     metric_executor_info,
     metric_executor_state,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
-from .monitoring.function_allowlist import function_allowlist_to_info_dict
 from .monitoring.health_check_handler import HealthCheckHandler
 from .monitoring.health_checker.health_checker import HealthChecker
 from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
 from .monitoring.server import MonitoringServer
 from .monitoring.startup_probe_handler import StartupProbeHandler
-from .
-from .
-from .task_runner import TaskInput, TaskOutput, TaskRunner
+from .state_reconciler import ExecutorStateReconciler
+from .state_reporter import ExecutorStateReporter
 
 metric_executor_state.state("starting")
 
@@ -57,32 +38,26 @@ class Executor:
     def __init__(
         self,
         id: str,
-        flavor: ExecutorFlavor,
         version: str,
         labels: Dict[str, str],
-
+        cache_path: Path,
         health_checker: HealthChecker,
-
+        function_uris: List[str],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        enable_grpc_state_reconciler: bool,
         blob_store: BLOBStore,
         host_resources_provider: HostResourcesProvider,
     ):
         self._logger = structlog.get_logger(module=__name__)
-        self._is_shutdown: bool = False
         protocol: str = "http"
         if config_path:
             self._logger.info("running the extractor with TLS enabled")
             protocol = "https"
 
-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
         self._startup_probe_handler = StartupProbeHandler()
         self._monitoring_server = MonitoringServer(
             host=monitoring_server_host,
@@ -91,32 +66,17 @@ class Executor:
             health_probe_handler=HealthCheckHandler(health_checker),
             metrics_handler=PrometheusMetricsHandler(),
         )
-        self._function_executor_states = FunctionExecutorStatesContainer(
-            logger=self._logger
-        )
-        health_checker.set_function_executor_states_container(
-            self._function_executor_states
-        )
-        self._downloader = Downloader(
-            code_path=code_path,
-            base_url=self._base_url,
-            blob_store=blob_store,
-            config_path=config_path,
-        )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
             logger=self._logger,
         )
+        function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
         self._state_reporter = ExecutorStateReporter(
             executor_id=id,
-            flavor=flavor,
             version=version,
             labels=labels,
-            function_allowlist=
-            function_executor_states=self._function_executor_states,
+            function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
             logger=self._logger,
@@ -124,69 +84,48 @@ class Executor:
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
-        self.
-            base_url=self._base_url,
+        self._state_reconciler = ExecutorStateReconciler(
             executor_id=id,
+            function_executor_server_factory=function_executor_server_factory,
+            base_url=f"{protocol}://{server_addr}",
             config_path=config_path,
-
+            cache_path=cache_path,
             blob_store=blob_store,
+            channel_manager=self._channel_manager,
+            state_reporter=self._state_reporter,
+            logger=self._logger,
         )
-
-
-        self._task_runner: Optional[TaskRunner] = None
-        self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode state reconciler that runs tasks
-        self._state_reconciler: Optional[ExecutorStateReconciler] = None
-
-        if enable_grpc_state_reconciler:
-            self._state_reconciler = ExecutorStateReconciler(
-                executor_id=id,
-                function_executor_server_factory=self._function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-                downloader=self._downloader,
-                task_reporter=self._task_reporter,
-                channel_manager=self._channel_manager,
-                state_reporter=self._state_reporter,
-                logger=self._logger,
-            )
-        else:
-            self._task_runner = TaskRunner(
-                executor_id=id,
-                function_executor_server_factory=function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-            )
-            self._task_fetcher = TaskFetcher(
-                executor_id=id,
-                executor_version=version,
-                labels=labels,
-                function_allowlist=function_allowlist,
-                protocol=protocol,
-                indexify_server_addr=self._server_addr,
-                config_path=config_path,
-            )
+        self._run_aio_task: Optional[asyncio.Task] = None
+        self._shutdown_aio_task: Optional[asyncio.Task] = None
 
         executor_info: Dict[str, str] = {
             "id": id,
-            "flavor": flavor.name,
             "version": version,
-            "
+            "cache_path": str(cache_path),
             "server_addr": server_addr,
             "grpc_server_addr": str(grpc_server_addr),
             "config_path": str(config_path),
-            "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
         for key, value in labels.items():
             executor_info["label_" + key] = value
-        executor_info.update(
+        executor_info.update(function_allowlist_to_indexed_dict(function_allowlist))
         metric_executor_info.info(executor_info)
 
     def run(self):
         asyncio.new_event_loop()
+
+        self._run_aio_task = asyncio.get_event_loop().create_task(
+            self._run(),
+            name="executor startup and run loop",
+        )
+
+        try:
+            asyncio.get_event_loop().run_until_complete(self._run_aio_task)
+        except asyncio.CancelledError:
+            pass  # Expected exception on shutdown
+
+    async def _run(self):
         for signum in [
             signal.SIGABRT,
             signal.SIGINT,
@@ -195,235 +134,42 @@ class Executor:
             signal.SIGHUP,
         ]:
             asyncio.get_event_loop().add_signal_handler(
-                signum, self.
+                signum, self._shutdown_signal_handler, asyncio.get_event_loop()
             )
 
-        asyncio.
+        asyncio.create_task(
             self._monitoring_server.run(), name="monitoring server runner"
         )
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_RUNNING
         )
-
-
-        )
-
+        self._state_reporter.run()
+        self._state_reconciler.run()
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()
 
-
-        if self._state_reconciler is None:
-            asyncio.get_event_loop().run_until_complete(
-                self._http_task_runner_loop()
-            )
-        else:
-            asyncio.get_event_loop().run_until_complete(
-                self._grpc_state_reconciler_loop()
-            )
-        except asyncio.CancelledError:
-            pass  # Suppress this expected exception and return without error (normally).
-
-    async def _grpc_state_reconciler_loop(self):
-        """Runs the gRPC state reconciler and state reporter.
-
-        Never raises any exceptions."""
-        await self._state_reconciler.run()
-
-    async def _http_task_runner_loop(self):
-        while not self._is_shutdown:
-            try:
-                async for task in self._task_fetcher.run():
-                    metric_tasks_fetched.inc()
-                    if not self._is_shutdown:
-                        asyncio.create_task(
-                            self._run_task(task), name="task runner (http mode)"
-                        )
-                self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
-            except Exception as e:
-                self._logger.error(
-                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
-                )
-            if not self._is_shutdown:
-                await asyncio.sleep(5)
-
-    async def _run_task(self, task: Task) -> None:
-        """Runs the supplied task.
-
-        Doesn't raise any Exceptions. All errors are reported to the server."""
-        start_time: float = time.monotonic()
-        logger = self._task_logger(task)
-        output: Optional[TaskOutput] = None
-
-        try:
-            output = await self._run_task_and_get_output(task, logger)
-            logger.info("task execution finished", success=output.success)
-        except Exception as e:
-            output = TaskOutput.internal_error(
-                task_id=task.id,
-                namespace=task.namespace,
-                graph_name=task.compute_graph,
-                function_name=task.compute_fn,
-                graph_version=task.graph_version,
-                graph_invocation_id=task.invocation_id,
-                output_payload_uri_prefix=task.output_payload_uri_prefix,
-            )
-            logger.error("task execution failed", exc_info=e)
-
-        if output.metrics is not None:
-            self.log_function_metrics(output)
-
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output=output, logger=logger)
-
-        metric_task_completion_latency.observe(time.monotonic() - start_time)
-
-    def log_function_metrics(self, output: TaskOutput):
-        for counter_name, counter_value in output.metrics.counters.items():
-            self._logger.info(
-                f"function_metric",
-                counter_name=counter_name,
-                counter_value=counter_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-        for timer_name, timer_value in output.metrics.timers.items():
-            self._logger.info(
-                f"function_metric",
-                timer_name=timer_name,
-                timer_value=timer_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-
-    async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
-        graph: SerializedObject = await self._downloader.download_graph(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_version=task.graph_version,
-            logger=logger,
-            data_payload=task.graph_payload,
-        )
-        input: SerializedObject = await self._downloader.download_input(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_invocation_id=task.invocation_id,
-            input_key=task.input_key,
-            data_payload=task.input_payload,
-            logger=logger,
-        )
-        init_value: Optional[SerializedObject] = (
-            None
-            if task.reducer_output_id is None and task.reducer_input_payload is None
-            else (
-                await self._downloader.download_init_value(
-                    namespace=task.namespace,
-                    graph_name=task.compute_graph,
-                    function_name=task.compute_fn,
-                    graph_invocation_id=task.invocation_id,
-                    reducer_output_key=task.reducer_output_id,
-                    data_payload=task.reducer_input_payload,
-                    logger=logger,
-                )
-            )
-        )
-        return await self._task_runner.run(
-            TaskInput(
-                task=task,
-                graph=graph,
-                input=input,
-                init_value=init_value,
-            ),
-            logger=logger,
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
+        # Run the Executor forever until it is shut down.
         while True:
-
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(5)
+            await asyncio.sleep(10)
 
-
-        if
-
-
-        ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    async def _shutdown(self, loop):
-        self._logger.info(
-            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
-        )
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STOPPING
+    def _shutdown_signal_handler(self, loop):
+        if self._shutdown_aio_task is None:
+            self._shutdown_aio_task = loop.create_task(
+                self._shutdown(), name="executor shutdown"
             )
-        metric_executor_state.state("shutting_down")
-        # There will be lots of task cancellation exceptions and "X is shutting down"
-        # exceptions logged during Executor shutdown. Suppress their logs as they are
-        # expected and are confusing for users.
-        suppress_logging()
 
-
-
-
-
-        if self._task_runner is not None:
-            await self._task_runner.shutdown()
-
-        if self._state_reporter is not None:
-            await self._state_reporter.shutdown()
-        if self._state_reconciler is not None:
-            await self._state_reconciler.shutdown()
-        if self._channel_manager is not None:
-            await self._channel_manager.destroy()
-
-        # We need to shutdown all users of FE states first,
-        # otherwise states might disappear unexpectedly and we might
-        # report errors, etc that are expected.
-        await self._function_executor_states.shutdown()
-        # We mainly need to cancel the task that runs _.*_mode_loop().
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-        # The current task is cancelled, the code after this line will not run.
+    async def _shutdown(self):
+        self._logger.info("shutting down Executor")
+        metric_executor_state.state("shutting_down")
 
-
-
+        # Shutdown state reconciler first because it changes reported state on shutdown.
+        await self._state_reconciler.shutdown()
 
-
-
-
-            graph=task.compute_graph,
-            graph_version=task.graph_version,
-            invocation_id=task.invocation_id,
-            function_name=task.compute_fn,
-            task_id=task.id,
+        # Do one last state report with STOPPED status. This reduces latency in the system.
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STOPPED
         )
+        await self._state_reporter.shutdown()
+        await self._channel_manager.destroy()
+        await self._monitoring_server.shutdown()
+        self._run_aio_task.cancel()
indexify/executor/function_allowlist.py
ADDED
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+@dataclass
+class FunctionURI:
+    namespace: str
+    compute_graph: str
+    compute_fn: str
+    version: Optional[str] = None
+
+
+def function_allowlist_to_indexed_dict(
+    function_allowlist: List[FunctionURI],
+) -> Dict[str, str]:
+    """Returns a dictionary with each function URI in the allowlist as a key-value pair.
+
+    The keys are prefixed indexes in function allowlist, and the values are the function URIs
+    """
+    indexed_dict = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        indexed_dict[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return indexed_dict
+
+
+def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
+    """Parses a list of function URIs from strings to FunctionURI objects."""
+    uris: List[FunctionURI] = []
+    for uri_str in function_uri_strs:
+        tokens = uri_str.split(":")
+        if len(tokens) < 3 or len(tokens) > 4:
+            raise ValueError(
+                "Function should be specified as <namespace>:<workflow>:<function>:<version> or"
+                "<namespace>:<workflow>:<function>"
+            )
+        version: Optional[str] = None
+        if len(tokens) == 4:
+            version = tokens[3]
+
+        uris.append(
+            FunctionURI(
+                namespace=tokens[0],
+                compute_graph=tokens[1],
+                compute_fn=tokens[2],
+                version=version,
+            )
+        )
+
+    return uris
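
A hedged usage sketch of the two helpers above; the namespace, workflow, and function names are made up:

from indexify.executor.function_allowlist import (
    function_allowlist_to_indexed_dict,
    parse_function_uris,
)

# Hypothetical allowlist entries: <namespace>:<workflow>:<function>[:<version>].
uris = parse_function_uris(["prod:billing:compute_invoice:3", "prod:billing:ingest"])
print(function_allowlist_to_indexed_dict(uris))
# {'function_allowlist_0': 'prod:billing:compute_invoice:3',
#  'function_allowlist_1': 'prod:billing:ingest:None'}

Note that a versionless URI is rendered with the literal string None, since the joiner calls str(function_uri.version).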
indexify/executor/function_executor/function_executor.py
CHANGED
@@ -56,7 +56,11 @@ from .server.function_executor_server_factory import (
 )
 
 
-class
+class FunctionError(RuntimeError):
+    pass
+
+
+class FunctionTimeoutError(FunctionError):
     pass
 
 
@@ -92,7 +96,7 @@ class FunctionExecutor:
    ):
        """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
-        Raises
+        Raises FunctionError if the server failed to initialize due to an error in customer owned code or data.
        Raises an Exception if an internal error occured."""
        try:
            with (
@@ -134,7 +138,9 @@ class FunctionExecutor:
    async def destroy(self):
        """Destroys all resources owned by this FunctionExecutor.
 
-        Never raises any exceptions but logs them."""
+        Never raises any exceptions but logs them.
+        Idempotent.
+        """
        try:
            with (
                metric_destroy_errors.count_exceptions(),
@@ -312,12 +318,12 @@ async def _initialize_server(
        if initialize_response.success:
            return
        if initialize_response.HasField("customer_error"):
-            raise
+            raise FunctionError(initialize_response.customer_error)
        else:
            raise Exception("initialize RPC failed at function executor server")
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            raise
-            f"
+            raise FunctionTimeoutError(
+                f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec."
            ) from e
        raise
indexify/executor/function_executor/invocation_state_client.py
CHANGED
@@ -15,7 +15,6 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 )
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 
-from ..downloader import serialized_object_from_http_response
 from .metrics.invocation_state_client import (
     metric_request_read_errors,
     metric_server_get_state_request_errors,
@@ -78,11 +77,18 @@ class InvocationStateClient:
        If a request is not comming from the task ID that was added here then it will
        be rejected. It's caller's responsibility to only add task IDs that are being
        executed by the Function Executor so the Function Executor can't get access to
-        invocation state of tasks it doesn't run."""
+        invocation state of tasks it doesn't run.
+
+        Doesn't raise any exceptions.
+        """
        self._task_id_to_invocation_id[task_id] = invocation_id
 
    def remove_task_to_invocation_id_entry(self, task_id: str) -> None:
-
+        """Removes a task ID to invocation ID entry from the client's internal state.
+
+        Doesn't raise any exceptions.
+        """
+        self._task_id_to_invocation_id.pop(task_id, None)
 
    async def destroy(self) -> None:
        if self._request_loop_task is not None:
@@ -257,3 +263,19 @@ class InvocationStateClient:
            )
        else:
            raise ValueError("unknown request type")
+
+
+def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
+    # We're hardcoding the content type currently used by Python SDK. It might change in the future.
+    # There's no other way for now to determine if the response is a bytes or string.
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(
+            bytes=response.content, content_type=response.headers["content-type"]
+        )
+    else:
+        return SerializedObject(
+            string=response.text, content_type=response.headers["content-type"]
+        )
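
A hedged sketch of calling the relocated helper; the import path follows the file list above, and the URL is a placeholder:

import httpx

from indexify.executor.function_executor.invocation_state_client import (
    serialized_object_from_http_response,
)

# Placeholder endpoint; real callers receive the response from the Indexify server.
response: httpx.Response = httpx.get("http://localhost:8900/some/payload")
obj = serialized_object_from_http_response(response)
# Binary content types ("application/octet-stream", "application/pickle") land in
# obj.bytes; any other content type is treated as text and lands in obj.string.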
indexify/executor/function_executor/server/function_executor_server_factory.py
CHANGED
@@ -24,9 +24,9 @@ class FunctionExecutorServerConfiguration:
     graph_version: str
     image_uri: Optional[str]
     secret_names: List[str]
-    cpu_ms_per_sec:
-    memory_bytes:
-    disk_bytes:
+    cpu_ms_per_sec: int
+    memory_bytes: int
+    disk_bytes: int
     gpu_count: int
 
 
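
A small arithmetic sketch of the now-required resource fields. The unit reading of cpu_ms_per_sec (CPU milliseconds per wall-clock second, i.e. 1000 per full core) is an inference from the field name, not something the diff states:

# Assumed unit conversions, for illustration only.
cores = 2
cpu_ms_per_sec = cores * 1000   # 2 full cores -> 2000 CPU-ms per second
memory_bytes = 4 * 1024**3      # 4 GiB
disk_bytes = 10 * 1024**3       # 10 GiB
gpu_count = 0
print(cpu_ms_per_sec, memory_bytes, disk_bytes, gpu_count)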