indexify 0.3.19-py3-none-any.whl → 0.3.21-py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- indexify/cli/cli.py +12 -0
- indexify/executor/api_objects.py +11 -6
- indexify/executor/blob_store/blob_store.py +69 -0
- indexify/executor/blob_store/local_fs_blob_store.py +48 -0
- indexify/executor/blob_store/metrics/blob_store.py +33 -0
- indexify/executor/blob_store/s3_blob_store.py +88 -0
- indexify/executor/downloader.py +192 -27
- indexify/executor/executor.py +29 -13
- indexify/executor/function_executor/function_executor.py +1 -1
- indexify/executor/function_executor/function_executor_states_container.py +5 -0
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +7 -2
- indexify/executor/function_executor/invocation_state_client.py +4 -2
- indexify/executor/function_executor/single_task_runner.py +2 -0
- indexify/executor/function_executor/task_output.py +8 -1
- indexify/executor/grpc/channel_manager.py +4 -3
- indexify/executor/grpc/function_executor_controller.py +163 -193
- indexify/executor/grpc/metrics/state_reconciler.py +17 -0
- indexify/executor/grpc/metrics/task_controller.py +8 -0
- indexify/executor/grpc/state_reconciler.py +305 -188
- indexify/executor/grpc/state_reporter.py +18 -10
- indexify/executor/grpc/task_controller.py +247 -189
- indexify/executor/metrics/task_reporter.py +17 -0
- indexify/executor/task_reporter.py +217 -94
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +37 -11
- indexify/proto/executor_api_pb2.py +49 -47
- indexify/proto/executor_api_pb2.pyi +55 -15
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
- indexify/executor/grpc/completed_tasks_container.py +0 -26
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0
indexify/executor/task_reporter.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import time
 from typing import Any, List, Optional, Tuple
 
@@ -7,8 +8,9 @@ from httpx import Timeout
 from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
 from tensorlake.utils.http_client import get_httpx_client
 
+from indexify.proto.executor_api_pb2 import DataPayload as DataPayloadProto
 from indexify.proto.executor_api_pb2 import (
-
+    DataPayloadEncoding,
     OutputEncoding,
     ReportTaskOutcomeRequest,
     TaskOutcome,
@@ -18,10 +20,12 @@ from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
 from .api_objects import (
     TASK_OUTCOME_FAILURE,
     TASK_OUTCOME_SUCCESS,
+    DataPayload,
     IngestFnOutputsResponse,
     RouterOutput,
     TaskResult,
 )
+from .blob_store.blob_store import BLOBStore
 from .function_executor.task_output import TaskOutput
 from .grpc.channel_manager import ChannelManager
 from .metrics.task_reporter import (
@@ -31,6 +35,9 @@ from .metrics.task_reporter import (
     metric_server_ingest_files_errors,
     metric_server_ingest_files_latency,
     metric_server_ingest_files_requests,
+    metric_task_output_blob_store_upload_errors,
+    metric_task_output_blob_store_upload_latency,
+    metric_task_output_blob_store_uploads,
 )
 
 
```
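The new imports line up with the rest of this file's changes: `hashlib` backs the new `_compute_hash` helper, `BLOBStore` backs the new blob-store upload path, and `DataPayloadProto`/`DataPayloadEncoding` come from the reworked `DataPayload` message in `executor_api.proto` (see the proto diff at the end of this page).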
```diff
@@ -62,6 +69,7 @@ class TaskReporter:
         base_url: str,
         executor_id: str,
         channel_manager: ChannelManager,
+        blob_store: BLOBStore,
         config_path: Optional[str] = None,
     ):
         self._base_url = base_url
@@ -74,8 +82,9 @@ class TaskReporter:
         # results in not reusing established TCP connections to server.
         self._client = get_httpx_client(config_path, make_async=False)
         self._channel_manager = channel_manager
+        self._blob_store = blob_store
 
-    async def shutdown(self):
+    async def shutdown(self) -> None:
         """Shuts down the task reporter.
 
         Task reporter stops reporting all task outcomes to the Server.
@@ -84,7 +93,7 @@ class TaskReporter:
         """
         self._is_shutdown = True
 
-    async def report(self, output: TaskOutput, logger: Any):
+    async def report(self, output: TaskOutput, logger: Any) -> None:
         """Reports result of the supplied task."""
         logger = logger.bind(module=__name__)
 
```
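`TaskReporter` now requires a `BLOBStore` collaborator. A minimal wiring sketch, assuming `channel_manager` and `blob_store` were built elsewhere (their construction is not part of this file's diff); the `base_url` and `executor_id` values are illustrative only:

```python
# Hypothetical wiring sketch; only the argument names come from this diff.
reporter = TaskReporter(
    base_url="http://localhost:8900",  # assumed server address
    executor_id="executor-1",          # assumed executor id
    channel_manager=channel_manager,   # from indexify/executor/grpc/channel_manager.py
    blob_store=blob_store,             # from indexify/executor/blob_store/blob_store.py
)
```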
```diff
@@ -94,9 +103,13 @@ class TaskReporter:
             )
             return
 
-
-
+        # TODO: If the files are uploaded successfully,
+        # we should record that so that if we fail to report
+        # the task outcome, we don't retry the upload.
+        # This will save us some time and resources.
+        # It's good to do this once we delete all the legacy code paths.
 
+        output_summary: TaskOutputSummary = _task_output_summary(output)
         logger.info(
             "reporting task outcome",
             total_bytes=output_summary.total_bytes,
@@ -110,6 +123,80 @@ class TaskReporter:
             stderr_bytes=output_summary.stderr_total_bytes,
         )
 
+        if output.output_payload_uri_prefix is None:
+            ingested_files = await self._ingest_files_at_server(output, logger)
+        else:
+            ingested_files = await self._ingest_files_at_blob_store(output, logger)
+
+        fn_outputs = []
+        for data_payload in ingested_files.data_payloads:
+            fn_outputs.append(
+                DataPayloadProto(
+                    path=data_payload.path,  # TODO: stop using this deprecated field once Server side migration is done.
+                    uri=data_payload.path,
+                    size=data_payload.size,
+                    sha256_hash=data_payload.sha256_hash,
+                    encoding=_to_grpc_data_payload_encoding(output),
+                    encoding_version=0,
+                )
+            )
+        stdout, stderr = None, None
+        if ingested_files.stdout is not None:
+            stdout = DataPayloadProto(
+                path=ingested_files.stdout.path,  # TODO: stop using this deprecated field once Server side migration is done.
+                uri=ingested_files.stdout.path,
+                size=ingested_files.stdout.size,
+                sha256_hash=ingested_files.stdout.sha256_hash,
+                encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
+                encoding_version=0,
+            )
+        if ingested_files.stderr is not None:
+            stderr = DataPayloadProto(
+                path=ingested_files.stderr.path,  # TODO: stop using this deprecated field once Server side migration is done.
+                uri=ingested_files.stderr.path,
+                size=ingested_files.stderr.size,
+                sha256_hash=ingested_files.stderr.sha256_hash,
+                encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
+                encoding_version=0,
+            )
+
+        request = ReportTaskOutcomeRequest(
+            task_id=output.task_id,
+            namespace=output.namespace,
+            graph_name=output.graph_name,
+            function_name=output.function_name,
+            graph_invocation_id=output.graph_invocation_id,
+            outcome=_to_grpc_task_outcome(output),
+            invocation_id=output.graph_invocation_id,
+            executor_id=self._executor_id,
+            reducer=output.reducer,
+            next_functions=(output.router_output.edges if output.router_output else []),
+            fn_outputs=fn_outputs,
+            stdout=stdout,
+            stderr=stderr,
+            output_encoding=_to_grpc_output_encoding(output),
+            output_encoding_version=0,
+        )
+        try:
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            with (
+                metric_report_task_outcome_latency.time(),
+                metric_report_task_outcome_errors.count_exceptions(),
+            ):
+                metric_report_task_outcome_rpcs.inc()
+                await stub.report_task_outcome(request, timeout=5.0)
+        except Exception as e:
+            logger.error("failed to report task outcome", error=e)
+            raise e
+
+    async def _ingest_files_at_server(
+        self, output: TaskOutput, logger: Any
+    ) -> IngestFnOutputsResponse:
+        logger.warning("uploading task output files to server (deprecated mode)")
+
+        task_result, output_files = self._process_task_output(output)
+        task_result_data = task_result.model_dump_json(exclude_none=True)
+
         kwargs = {
             "data": {"task_result": task_result_data},
             # Use httpx default timeout of 5s for all timeout types.
```
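The rewritten `report()` picks the upload path per task: tasks without an `output_payload_uri_prefix` fall back to the legacy server-side ingestion (`_ingest_files_at_server`), while tasks that carry a prefix upload directly to the blob store. In both cases the ingested files are normalized into `DataPayloadProto` messages, with the same value written to both the deprecated `path` field and the new `uri` field until the server-side migration completes.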
```diff
@@ -132,7 +219,7 @@ class TaskReporter:
         )
         end_time = time.time()
         logger.info(
-            "files uploaded",
+            "files uploaded to server",
             response_time=end_time - start_time,
             response_code=response.status_code,
         )
@@ -148,68 +235,86 @@ class TaskReporter:
             f"Response text: '{response.text}'."
         ) from e
 
-        # TODO: If the files are uploaded successfully,
-        # we should record that so that if we fail to report
-        # the task outcome, we don't retry the upload.
-        # This will save us some time and resources.
-
         ingested_files_response = response.json()
-
-
-
-
-
-
-
-
-
-
-
-
+        return IngestFnOutputsResponse.model_validate(ingested_files_response)
+
+    async def _ingest_files_at_blob_store(
+        self, output: TaskOutput, logger: Any
+    ) -> IngestFnOutputsResponse:
+        start_time = time.time()
+        with (
+            metric_task_output_blob_store_upload_latency.time(),
+            metric_task_output_blob_store_upload_errors.count_exceptions(),
+        ):
+            metric_task_output_blob_store_uploads.inc()
+            response = await self._upload_output_to_blob_store(output, logger)
+
+        logger.info(
+            "files uploaded to blob store",
+            duration=time.time() - start_time,
+        )
+        return response
+
+    async def _upload_output_to_blob_store(
+        self, output: TaskOutput, logger: Any
+    ) -> IngestFnOutputsResponse:
+        data_payloads: List[DataPayload] = []
+        stdout: Optional[DataPayload] = None
+        stderr: Optional[DataPayload] = None
+
+        if output.stdout is not None:
+            stdout_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stdout"
+            stdout_bytes: bytes = output.stdout.encode()
+            await self._blob_store.put(stdout_url, stdout_bytes, logger)
             stdout = DataPayload(
-                path=
-                size=
-                sha256_hash=
+                path=stdout_url,
+                size=len(stdout_bytes),
+                sha256_hash=_compute_hash(stdout_bytes),
             )
-
+
+        if output.stderr is not None:
+            stderr_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stderr"
+            stderr_bytes: bytes = output.stderr.encode()
+            await self._blob_store.put(stderr_url, stderr_bytes, logger)
             stderr = DataPayload(
-                path=
-                size=
-                sha256_hash=
+                path=stderr_url,
+                size=len(stderr_bytes),
+                sha256_hash=_compute_hash(stderr_bytes),
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
+        if output.function_output is not None:
+            for func_output_item in output.function_output.outputs:
+                node_output_sequence = len(data_payloads)
+                if output.reducer:
+                    # Reducer tasks have to write their results into the same blob.
+                    output_url = (
+                        f"{output.output_payload_uri_prefix}.{node_output_sequence}"
+                    )
+                else:
+                    # Regular tasks write their results into different blobs made unique using task ids.
+                    output_url = f"{output.output_payload_uri_prefix}.{output.task_id}.{node_output_sequence}"
+
+                output_bytes: bytes = (
+                    func_output_item.bytes
+                    if func_output_item.HasField("bytes")
+                    else func_output_item.string.encode()
+                )
+                await self._blob_store.put(output_url, output_bytes, logger)
+                data_payloads.append(
+                    DataPayload(
+                        path=output_url,
+                        size=len(output_bytes),
+                        sha256_hash=_compute_hash(output_bytes),
+                    )
+                )
+
+        return IngestFnOutputsResponse(
+            data_payloads=data_payloads,
             stdout=stdout,
             stderr=stderr,
-            output_encoding=_to_grpc_output_encoding(output),
-            output_encoding_version=0,
         )
-        try:
-            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
-            with (
-                metric_report_task_outcome_latency.time(),
-                metric_report_task_outcome_errors.count_exceptions(),
-            ):
-                metric_report_task_outcome_rpcs.inc()
-                await stub.report_task_outcome(request, timeout=5.0)
-        except Exception as e:
-            logger.error("failed to report task outcome", error=e)
-            raise e
 
-    def _process_task_output(
-        self, output: TaskOutput
-    ) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
+    def _process_task_output(self, output: TaskOutput) -> Tuple[TaskResult, List[Any]]:
         task_result = TaskResult(
             outcome="failure",
             namespace=output.namespace,
```
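The blob URIs are derived from the server-supplied prefix. A worked example with a hypothetical prefix (all values illustrative; the naming scheme matches `_upload_output_to_blob_store` above):

```python
prefix = "s3://example-bucket/outputs/fn-a"  # hypothetical Task.output_payload_uri_prefix
task_id = "task-123"

stdout_uri = f"{prefix}.{task_id}.stdout"   # s3://example-bucket/outputs/fn-a.task-123.stdout
first_output_uri = f"{prefix}.{task_id}.0"  # regular task: task id keeps blobs unique
reducer_output_uri = f"{prefix}.0"          # reducer task: all writes target the same blob
```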
```diff
@@ -220,9 +325,8 @@ class TaskReporter:
             task_id=output.task_id,
         )
         output_files: List[Any] = []
-        summary: TaskOutputSummary = TaskOutputSummary()
         if output is None:
-            return task_result, output_files, summary
+            return task_result, output_files
 
         task_result.outcome = (
             TASK_OUTCOME_SUCCESS if output.success else TASK_OUTCOME_FAILURE
@@ -230,33 +334,19 @@ class TaskReporter:
         task_result.reducer = output.reducer
 
         _process_function_output(
-            function_output=output.function_output,
-            output_files=output_files,
-            summary=summary,
+            function_output=output.function_output, output_files=output_files
         )
         _process_router_output(
-            router_output=output.router_output, task_result=task_result
-        )
-        _process_stdout(
-            stdout=output.stdout, output_files=output_files, summary=summary
-        )
-        _process_stderr(
-            stderr=output.stderr, output_files=output_files, summary=summary
+            router_output=output.router_output, task_result=task_result
         )
+        _process_stdout(stdout=output.stdout, output_files=output_files)
+        _process_stderr(stderr=output.stderr, output_files=output_files)
 
-        summary.total_bytes = (
-            summary.output_total_bytes
-            + summary.stdout_total_bytes
-            + summary.stderr_total_bytes
-        )
-
-        return task_result, output_files, summary
+        return task_result, output_files
 
 
 def _process_function_output(
-    function_output: Optional[FunctionOutput],
-    output_files: List[Any],
-    summary: TaskOutputSummary,
+    function_output: Optional[FunctionOutput], output_files: List[Any]
 ) -> None:
     if function_output is None:
         return
@@ -269,25 +359,19 @@ def _process_function_output(
                 (nanoid.generate(), payload, output.content_type),
             )
         )
-        summary.output_count += 1
-        summary.output_total_bytes += len(payload)
 
 
 def _process_router_output(
     router_output: Optional[RouterOutput],
     task_result: TaskResult,
-    summary: TaskOutputSummary,
 ) -> None:
     if router_output is None:
         return
 
     task_result.router_output = RouterOutput(edges=router_output.edges)
-    summary.router_output_count += 1
 
 
-def _process_stdout(
-    stdout: Optional[str], output_files: List[Any], summary: TaskOutputSummary
-) -> None:
+def _process_stdout(stdout: Optional[str], output_files: List[Any]) -> None:
     if stdout is None:
         return
 
@@ -301,13 +385,9 @@ def _process_stdout(
             ),
         )
     )
-    summary.stdout_count += 1
-    summary.stdout_total_bytes += len(stdout)
 
 
-def _process_stderr(
-    stderr: Optional[str], output_files: List[Any], summary: TaskOutputSummary
-) -> None:
+def _process_stderr(stderr: Optional[str], output_files: List[Any]) -> None:
    if stderr is None:
        return
 
@@ -321,8 +401,38 @@ def _process_stderr(
             ),
         )
     )
-
-
+
+
+def _task_output_summary(output: TaskOutput) -> TaskOutputSummary:
+    summary: TaskOutputSummary = TaskOutputSummary()
+
+    if output.stdout is not None:
+        summary.stdout_count += 1
+        summary.stdout_total_bytes += len(output.stdout)
+
+    if output.stderr is not None:
+        summary.stderr_count += 1
+        summary.stderr_total_bytes += len(output.stderr)
+
+    if output.function_output is not None:
+        for func_output_item in output.function_output.outputs:
+            output_len: bytes = len(
+                func_output_item.bytes
+                if func_output_item.HasField("bytes")
+                else func_output_item.string
+            )
+            summary.output_count += 1
+            summary.output_total_bytes += output_len
+
+    if output.router_output is not None:
+        summary.router_output_count += 1
+
+    summary.total_bytes = (
+        summary.output_total_bytes
+        + summary.stdout_total_bytes
+        + summary.stderr_total_bytes
+    )
+    return summary
 
 
 def _to_grpc_task_outcome(task_output: TaskOutput) -> TaskOutcome:
@@ -337,3 +447,16 @@ def _to_grpc_output_encoding(task_output: TaskOutput) -> OutputEncoding:
         return OutputEncoding.OUTPUT_ENCODING_JSON
     else:
         return OutputEncoding.OUTPUT_ENCODING_PICKLE
+
+
+def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncoding:
+    if task_output.output_encoding == "json":
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
+    else:
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
+
+
+def _compute_hash(data: bytes) -> str:
+    hasher = hashlib.sha256(usedforsecurity=False)
+    hasher.update(data)
+    return hasher.hexdigest()
```
indexify/executor/task_runner.py
CHANGED

```diff
@@ -85,6 +85,7 @@ class TaskRunner:
                 function_name=task_input.task.compute_fn,
                 graph_version=task_input.task.graph_version,
                 graph_invocation_id=task_input.task.invocation_id,
+                output_payload_uri_prefix=task_input.task.output_payload_uri_prefix,
             )
         finally:
             if state is not None:
```
indexify/proto/executor_api.proto
CHANGED

```diff
@@ -4,6 +4,28 @@ syntax = "proto3";
 // Existing clients won't find the service if the package name changes.
 package executor_api_pb;
 
+// ===== DataPayload =====
+enum DataPayloadEncoding {
+  DATA_PAYLOAD_ENCODING_UNKNOWN = 0;
+  // These encodings are currently mapping 1:1 to mime types.
+  // TODO: use SDK specific encodings becase 1:1 mapping might not work in the future.
+  DATA_PAYLOAD_ENCODING_UTF8_JSON = 1;
+  DATA_PAYLOAD_ENCODING_UTF8_TEXT = 2;
+  DATA_PAYLOAD_ENCODING_BINARY_PICKLE = 3;
+}
+
+message DataPayload {
+  optional string path = 1; // deprecated, TODO: remove when URI us used everywhere
+  optional uint64 size = 2;
+  optional string sha256_hash = 3;
+  // URI of the data.
+  // S3 URI if the data is stored in S3.
+  // Starts with "file://"" prefix if the data is stored on a local file system.
+  optional string uri = 4;
+  optional DataPayloadEncoding encoding = 5;
+  optional uint64 encoding_version = 6;
+}
+
 // ===== report_executor_state RPC =====
 
 enum GPUModel {
```
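A minimal sketch of populating the new `DataPayload` message from Python through the generated bindings imported in `task_reporter.py` above (field values are hypothetical):

```python
from indexify.proto.executor_api_pb2 import DataPayload, DataPayloadEncoding

payload = DataPayload(
    uri="file:///var/indexify/outputs/fn-a.task-123.0",   # hypothetical local-FS URI
    path="file:///var/indexify/outputs/fn-a.task-123.0",  # deprecated; mirrors uri for now
    size=123,
    sha256_hash="0" * 64,  # placeholder; real value is the hex digest of the blob bytes
    encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE,
    encoding_version=0,
)
```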
```diff
@@ -72,6 +94,7 @@ message FunctionExecutorDescription {
   optional HostResources resource_limits = 8;
   // Timeout for customer code duration during FE creation.
   optional uint32 customer_code_timeout_ms = 9;
+  optional DataPayload graph = 10;
 }
 
 message FunctionExecutorState {
@@ -112,6 +135,9 @@ message ExecutorState {
   repeated FunctionExecutorState function_executor_states = 9;
   map<string, string> labels = 10;
   optional string state_hash = 11;
+  // Server supplied clock value of the latest desired executor state that was
+  // reconciled by Executor. Not included into state_hash.
+  optional uint64 server_clock = 12;
 }
 
 // A message sent by Executor to report its up to date state to Server.
```
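`server_clock` lets the executor echo back which desired-state version it has already reconciled; keeping it out of `state_hash` means a clock bump alone does not make the reported state look changed.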
```diff
@@ -131,9 +157,15 @@ message Task {
   optional string graph_version = 4;
   optional string function_name = 5;
   optional string graph_invocation_id = 6;
-  optional string input_key = 8;
-  optional string reducer_output_key = 9;
+  optional string input_key = 8; // deprecated. TODO: remove when input is used everywhere
+  optional string reducer_output_key = 9; // deprecated. TODO: remove when reducer_input is used everywhere
   optional uint32 timeout_ms = 10;
+  optional DataPayload input = 11;
+  optional DataPayload reducer_input = 12;
+  // URI prefix for the output payloads.
+  // S3 URI if the data is stored in S3.
+  // Starts with "file://"" prefix followed by an absolute directory path if the data is stored on a local file system.
+  optional string output_payload_uri_prefix = 13;
 }
 
 message TaskAllocation {
```
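With `input`, `reducer_input`, and `output_payload_uri_prefix`, the server can hand the executor both its inputs and the destination for its outputs as blob URIs rather than the deprecated server-managed keys. Illustrative prefix values (hypothetical): `s3://example-bucket/outputs/fn-a` for S3, or `file:///var/indexify/outputs/fn-a` for a local file system; the executor appends the task id and output sequence as shown in the `task_reporter.py` diff above.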
```diff
@@ -163,12 +195,6 @@ enum TaskOutcome {
   TASK_OUTCOME_FAILURE = 2;
 }
 
-message DataPayload {
-  optional string path = 1;
-  optional uint64 size = 2;
-  optional string sha256_hash = 3;
-}
-
 enum OutputEncoding {
   OUTPUT_ENCODING_UNKNOWN = 0;
   OUTPUT_ENCODING_JSON = 1;
@@ -183,7 +209,7 @@ message ReportTaskOutcomeRequest {
   optional string function_name = 4;
   optional string graph_invocation_id = 6;
   optional TaskOutcome outcome = 7;
-  optional string invocation_id = 8;
+  optional string invocation_id = 8; // deprecated. TODO: remove when graph_invocation_id is used everywhere
   optional string executor_id = 9;
   optional bool reducer = 10;
 
@@ -196,10 +222,10 @@ message ReportTaskOutcomeRequest {
   optional DataPayload stdout = 14;
   optional DataPayload stderr = 15;
   // Output encoding of all the outputs of a function have to be same.
-  optional OutputEncoding output_encoding = 13;
+  optional OutputEncoding output_encoding = 13; // deprecated. TODO: remove when DataPayload.encoding is used everywhere
   // This allows us to change how we encode the output from functions
   // and serialize them into storage.
-  optional uint64 output_encoding_version = 5;
+  optional uint64 output_encoding_version = 5; // deprecated. TODO: remove when DataPayload.encoding_version is used everywhere
 }
 
 message ReportTaskOutcomeResponse {
```