nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest might be problematic.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py
@@ -0,0 +1,195 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import math
import os
from typing import Dict, Any

import ray
from pydantic import BaseModel

from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
    add_source_stage,
    add_metadata_injector_stage,
    add_pdf_extractor_stage,
    add_image_extractor_stage,
    add_docx_extractor_stage,
    add_audio_extractor_stage,
    add_image_dedup_stage,
    add_image_filter_stage,
    add_table_extractor_stage,
    add_chart_extractor_stage,
    add_image_caption_stage,
    add_text_splitter_stage,
    add_text_embedding_stage,
    add_embedding_storage_stage,
    add_image_storage_stage,
    add_message_broker_response_stage,
    add_pptx_extractor_stage,
    add_infographic_extractor_stage,
    add_otel_tracer_stage,
    add_default_drain_stage,
)
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe

logger = logging.getLogger("uvicorn")


def export_config_to_env(ingest_config: Any) -> None:
    if isinstance(ingest_config, BaseModel):
        ingest_config = ingest_config.model_dump()

    os.environ.update({key.upper(): val for key, val in ingest_config.items()})


def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any] = None):
    # Initialize the pipeline with the configuration
    if ingest_config:
        # Export the config to environment variables
        export_config_to_env(ingest_config)

    current_level = logging.getLogger().getEffectiveLevel()
    ray.init(
        namespace="nv_ingest_ray",
        logging_level=current_level,
        ignore_reinit_error=True,
        dashboard_host="0.0.0.0",
        dashboard_port=8265,
        _system_config={
            "local_fs_capacity_threshold": 0.9,
            "object_spilling_config": json.dumps(
                {
                    "type": "filesystem",
                    "params": {
                        "directory_path": [
                            "/tmp/ray_spill_testing_0",
                            "/tmp/ray_spill_testing_1",
                            "/tmp/ray_spill_testing_2",
                            "/tmp/ray_spill_testing_3",
                        ],
                        "buffer_size": 100_000_000,
                    },
                },
            ),
        },
    )
    system_resource_probe = SystemResourceProbe()

    effective_cpu_core_count = system_resource_probe.get_effective_cores()
    default_cpu_count = int(os.environ.get("NV_INGEST_MAX_UTIL", int(max(1, math.floor(effective_cpu_core_count)))))

    add_meter_stage = os.environ.get("MESSAGE_CLIENT_TYPE") != "simple"
    _ = add_meter_stage  # TODO(Devin)

    ########################################################################################################
    ## Insertion and Pre-processing stages
    ########################################################################################################
    logger.debug("Setting up ingestion pipeline")
    source_stage_id = add_source_stage(pipeline, default_cpu_count)
    # TODO(Devin): Job counter used a global stats object that isn't ray compatible, need to update.
    # submitted_job_counter_stage = add_submitted_job_counter_stage(pipe, morpheus_pipeline_config, ingest_config)
    metadata_injector_stage_id = add_metadata_injector_stage(pipeline, default_cpu_count)
    ########################################################################################################

    ########################################################################################################
    ## Primitive extraction
    ########################################################################################################
    pdf_extractor_stage_id = add_pdf_extractor_stage(pipeline, default_cpu_count)
    image_extractor_stage_id = add_image_extractor_stage(pipeline, default_cpu_count)
    docx_extractor_stage_id = add_docx_extractor_stage(pipeline, default_cpu_count)
    pptx_extractor_stage_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
    audio_extractor_stage_id = add_audio_extractor_stage(pipeline, default_cpu_count)
    ########################################################################################################

    ########################################################################################################
    ## Post-processing
    ########################################################################################################
    image_dedup_stage_id = add_image_dedup_stage(pipeline, default_cpu_count)
    image_filter_stage_id = add_image_filter_stage(pipeline, default_cpu_count)
    table_extraction_stage_id = add_table_extractor_stage(pipeline, default_cpu_count)
    chart_extraction_stage_id = add_chart_extractor_stage(pipeline, default_cpu_count)
    infographic_extraction_stage_id = add_infographic_extractor_stage(pipeline, default_cpu_count)
    image_caption_stage_id = add_image_caption_stage(pipeline, default_cpu_count)
    ########################################################################################################

    ########################################################################################################
    ## Transforms and data synthesis
    ########################################################################################################
    text_splitter_stage_id = add_text_splitter_stage(pipeline, default_cpu_count)
    embed_extractions_stage_id = add_text_embedding_stage(pipeline, default_cpu_count)

    ########################################################################################################
    ## Storage and output
    ########################################################################################################
    embedding_storage_stage_id = add_embedding_storage_stage(pipeline, default_cpu_count)
    image_storage_stage_id = add_image_storage_stage(pipeline, default_cpu_count)
    # vdb_task_sink_stage = add_vdb_task_sink_stage(pipe, morpheus_pipeline_config, ingest_config)
    broker_response_stage_id = add_message_broker_response_stage(pipeline, default_cpu_count)
    ########################################################################################################

    #######################################################################################################
    ## Telemetry (Note: everything after the sync stage is out of the hot path, please keep it that way) ##
    #######################################################################################################
    otel_tracer_stage_id = add_otel_tracer_stage(pipeline, default_cpu_count)

    # TODO(devin)
    # if add_meter_stage:
    #     otel_meter_stage = add_otel_meter_stage(pipe, morpheus_pipeline_config, ingest_config)
    # else:
    #     otel_meter_stage = None
    # completed_job_counter_stage = add_completed_job_counter_stage(pipe, morpheus_pipeline_config, ingest_config)
    ########################################################################################################

    # Add a drain stage to the pipeline -- flushes and deletes control messages
    drain_id = add_default_drain_stage(pipeline, default_cpu_count)

    ingest_edge_buffer_size = int(os.environ.get("INGEST_EDGE_BUFFER_SIZE", 32))

    # Add edges
    ###### Intake Stages ########
    pipeline.make_edge(source_stage_id, metadata_injector_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(metadata_injector_stage_id, pdf_extractor_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Document Extractors ########
    pipeline.make_edge(pdf_extractor_stage_id, audio_extractor_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(audio_extractor_stage_id, docx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(docx_extractor_stage_id, pptx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(pptx_extractor_stage_id, image_extractor_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(image_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Primitive Extractors ########
    pipeline.make_edge(infographic_extraction_stage_id, table_extraction_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(table_extraction_stage_id, chart_extraction_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(chart_extraction_stage_id, image_filter_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Primitive Mutators ########
    pipeline.make_edge(image_filter_stage_id, image_dedup_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(image_dedup_stage_id, text_splitter_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Primitive Transforms ########
    pipeline.make_edge(text_splitter_stage_id, embed_extractions_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(embed_extractions_stage_id, image_caption_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(image_caption_stage_id, image_storage_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Primitive Storage ########
    pipeline.make_edge(image_storage_stage_id, embedding_storage_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(embedding_storage_stage_id, broker_response_stage_id, queue_size=ingest_edge_buffer_size)

    ###### Response and Telemetry ########
    pipeline.make_edge(broker_response_stage_id, otel_tracer_stage_id, queue_size=ingest_edge_buffer_size)
    pipeline.make_edge(otel_tracer_stage_id, drain_id, queue_size=ingest_edge_buffer_size)

    pipeline.build()

    # TODO(devin)
    # if add_meter_stage:
    #     pipe.add_edge(sink_stage, otel_meter_stage)
    #     pipe.add_edge(otel_meter_stage, otel_tracer_stage)
    # else:
    #     pipe.add_edge(sink_stage, otel_tracer_stage)

    # pipe.add_edge(otel_tracer_stage, completed_job_counter_stage)
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
import time
from datetime import datetime
from typing import Union, Tuple

import ray
from pydantic import BaseModel, ConfigDict

from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline, ScalingConfig
from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline

logger = logging.getLogger(__name__)


def str_to_bool(value: str) -> bool:
    return value.strip().lower() in {"1", "true", "yes", "on"}


DISABLE_DYNAMIC_SCALING = str_to_bool(os.environ.get("INGEST_DISABLE_DYNAMIC_SCALING", "false"))
DYNAMIC_MEMORY_THRESHOLD = float(os.environ.get("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75))


class PipelineCreationSchema(BaseModel):
    """
    Schema for pipeline creation configuration.

    Contains all parameters required to set up and execute the pipeline,
    including endpoints, API keys, and processing options.
    """

    # Audio processing settings
    audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
    audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
    audio_infer_protocol: str = os.getenv("AUDIO_INFER_PROTOCOL", "grpc")

    # Embedding model settings
    embedding_nim_endpoint: str = os.getenv("EMBEDDING_NIM_ENDPOINT", "https://integrate.api.nvidia.com/v1")
    embedding_nim_model_name: str = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")

    # General pipeline settings
    ingest_log_level: str = os.getenv("INGEST_LOG_LEVEL", "INFO")
    max_ingest_process_workers: str = os.getenv("MAX_INGEST_PROCESS_WORKERS", "16")

    # Messaging configuration
    message_client_host: str = os.getenv("MESSAGE_CLIENT_HOST", "localhost")
    message_client_port: str = os.getenv("MESSAGE_CLIENT_PORT", "7671")
    message_client_type: str = os.getenv("MESSAGE_CLIENT_TYPE", "simple")

    # NeMo Retriever settings
    nemoretriever_parse_http_endpoint: str = os.getenv(
        "NEMORETRIEVER_PARSE_HTTP_ENDPOINT", "https://integrate.api.nvidia.com/v1/chat/completions"
    )
    nemoretriever_parse_infer_protocol: str = os.getenv("NEMORETRIEVER_PARSE_INFER_PROTOCOL", "http")
    nemoretriever_parse_model_name: str = os.getenv("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")

    # API keys
    ngc_api_key: str = os.getenv("NGC_API_KEY", "")
    nvidia_build_api_key: str = os.getenv("NVIDIA_BUILD_API_KEY", "")

    # Observability settings
    otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")

    # OCR settings
    paddle_http_endpoint: str = os.getenv("PADDLE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
    paddle_infer_protocol: str = os.getenv("PADDLE_INFER_PROTOCOL", "http")

    # Task queue settings
    REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"

    # Vision language model settings
    vlm_caption_endpoint: str = os.getenv(
        "VLM_CAPTION_ENDPOINT", "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
    )
    vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")

    # YOLOX image processing settings
    yolox_graphic_elements_http_endpoint: str = os.getenv(
        "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1",
    )
    yolox_graphic_elements_infer_protocol: str = os.getenv("YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL", "http")

    # YOLOX page elements settings
    yolox_http_endpoint: str = os.getenv(
        "YOLOX_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
    )
    yolox_infer_protocol: str = os.getenv("YOLOX_INFER_PROTOCOL", "http")

    # YOLOX table structure settings
    yolox_table_structure_http_endpoint: str = os.getenv(
        "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
    )
    yolox_table_structure_infer_protocol: str = os.getenv("YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL", "http")

    model_config = ConfigDict(extra="forbid")


def _launch_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool,
    disable_dynamic_scaling: bool = None,
    dynamic_memory_threshold: float = None,
) -> Tuple[Union[RayPipeline, None], float]:
    logger.info("Starting pipeline setup")

    dynamic_memory_scaling = not DISABLE_DYNAMIC_SCALING
    if disable_dynamic_scaling is not None:
        dynamic_memory_scaling = not disable_dynamic_scaling

    dynamic_memory_threshold = dynamic_memory_threshold if dynamic_memory_threshold else DYNAMIC_MEMORY_THRESHOLD

    scaling_config = ScalingConfig(
        dynamic_memory_scaling=dynamic_memory_scaling, dynamic_memory_threshold=dynamic_memory_threshold
    )

    pipeline = RayPipeline(scaling_config=scaling_config)
    start_abs = datetime.now()

    # Set up the ingestion pipeline
    setup_ingestion_pipeline(pipeline, ingest_config.model_dump())

    # Record setup time
    end_setup = start_run = datetime.now()
    setup_elapsed = (end_setup - start_abs).total_seconds()
    logger.info(f"Pipeline setup completed in {setup_elapsed:.2f} seconds")

    # Run the pipeline
    logger.debug("Running pipeline")
    pipeline.start()

    if block:
        try:
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            logger.info("Interrupt received, shutting down pipeline.")
            pipeline.stop()
            ray.shutdown()
            logger.info("Ray shutdown complete.")

        # Record execution times
        end_run = datetime.now()
        run_elapsed = (end_run - start_run).total_seconds()
        total_elapsed = (end_run - start_abs).total_seconds()

        logger.info(f"Pipeline run completed in {run_elapsed:.2f} seconds")
        logger.info(f"Total time elapsed: {total_elapsed:.2f} seconds")

        return None, total_elapsed
    else:
        return pipeline, 0.0


def run_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool = True,
    disable_dynamic_scaling: bool = None,
    dynamic_memory_threshold: float = None,
) -> Union[RayPipeline, float]:
    pipeline, total_elapsed = _launch_pipeline(ingest_config, block, disable_dynamic_scaling, dynamic_memory_threshold)

    if block:
        logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")

    return pipeline