PyPI - nv-ingest-api - Versions diffs - 26.1.0rc4__py3-none-any.whl - Mend

nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show

nv_ingest_api/__init__.py +3 -0
nv_ingest_api/interface/__init__.py +218 -0
nv_ingest_api/interface/extract.py +977 -0
nv_ingest_api/interface/mutate.py +154 -0
nv_ingest_api/interface/store.py +200 -0
nv_ingest_api/interface/transform.py +382 -0
nv_ingest_api/interface/utility.py +186 -0
nv_ingest_api/internal/__init__.py +0 -0
nv_ingest_api/internal/enums/__init__.py +3 -0
nv_ingest_api/internal/enums/common.py +550 -0
nv_ingest_api/internal/extract/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/__init__.py +3 -0
nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
nv_ingest_api/internal/extract/docx/__init__.py +5 -0
nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
nv_ingest_api/internal/extract/html/__init__.py +3 -0
nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
nv_ingest_api/internal/extract/image/__init__.py +3 -0
nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
nv_ingest_api/internal/meta/__init__.py +3 -0
nv_ingest_api/internal/meta/udf.py +232 -0
nv_ingest_api/internal/mutate/__init__.py +3 -0
nv_ingest_api/internal/mutate/deduplicate.py +110 -0
nv_ingest_api/internal/mutate/filter.py +133 -0
nv_ingest_api/internal/primitives/__init__.py +0 -0
nv_ingest_api/internal/primitives/control_message_task.py +16 -0
nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
nv_ingest_api/internal/schemas/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
nv_ingest_api/internal/schemas/meta/udf.py +23 -0
nv_ingest_api/internal/schemas/mixins.py +39 -0
nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
nv_ingest_api/internal/schemas/store/__init__.py +3 -0
nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
nv_ingest_api/internal/store/__init__.py +3 -0
nv_ingest_api/internal/store/embed_text_upload.py +236 -0
nv_ingest_api/internal/store/image_upload.py +251 -0
nv_ingest_api/internal/transform/__init__.py +3 -0
nv_ingest_api/internal/transform/caption_image.py +219 -0
nv_ingest_api/internal/transform/embed_text.py +702 -0
nv_ingest_api/internal/transform/split_text.py +182 -0
nv_ingest_api/util/__init__.py +3 -0
nv_ingest_api/util/control_message/__init__.py +0 -0
nv_ingest_api/util/control_message/validators.py +47 -0
nv_ingest_api/util/converters/__init__.py +0 -0
nv_ingest_api/util/converters/bytetools.py +78 -0
nv_ingest_api/util/converters/containers.py +65 -0
nv_ingest_api/util/converters/datetools.py +90 -0
nv_ingest_api/util/converters/dftools.py +127 -0
nv_ingest_api/util/converters/formats.py +64 -0
nv_ingest_api/util/converters/type_mappings.py +27 -0
nv_ingest_api/util/dataloader/__init__.py +9 -0
nv_ingest_api/util/dataloader/dataloader.py +409 -0
nv_ingest_api/util/detectors/__init__.py +5 -0
nv_ingest_api/util/detectors/language.py +38 -0
nv_ingest_api/util/exception_handlers/__init__.py +0 -0
nv_ingest_api/util/exception_handlers/converters.py +72 -0
nv_ingest_api/util/exception_handlers/decorators.py +429 -0
nv_ingest_api/util/exception_handlers/detectors.py +74 -0
nv_ingest_api/util/exception_handlers/pdf.py +116 -0
nv_ingest_api/util/exception_handlers/schemas.py +68 -0
nv_ingest_api/util/image_processing/__init__.py +5 -0
nv_ingest_api/util/image_processing/clustering.py +260 -0
nv_ingest_api/util/image_processing/processing.py +177 -0
nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
nv_ingest_api/util/image_processing/transforms.py +850 -0
nv_ingest_api/util/imports/__init__.py +3 -0
nv_ingest_api/util/imports/callable_signatures.py +108 -0
nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
nv_ingest_api/util/introspection/__init__.py +3 -0
nv_ingest_api/util/introspection/class_inspect.py +145 -0
nv_ingest_api/util/introspection/function_inspect.py +65 -0
nv_ingest_api/util/logging/__init__.py +0 -0
nv_ingest_api/util/logging/configuration.py +102 -0
nv_ingest_api/util/logging/sanitize.py +84 -0
nv_ingest_api/util/message_brokers/__init__.py +3 -0
nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
nv_ingest_api/util/metadata/__init__.py +5 -0
nv_ingest_api/util/metadata/aggregators.py +516 -0
nv_ingest_api/util/multi_processing/__init__.py +8 -0
nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
nv_ingest_api/util/nim/__init__.py +161 -0
nv_ingest_api/util/pdf/__init__.py +3 -0
nv_ingest_api/util/pdf/pdfium.py +428 -0
nv_ingest_api/util/schema/__init__.py +3 -0
nv_ingest_api/util/schema/schema_validator.py +10 -0
nv_ingest_api/util/service_clients/__init__.py +3 -0
nv_ingest_api/util/service_clients/client_base.py +86 -0
nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
nv_ingest_api/util/string_processing/__init__.py +51 -0
nv_ingest_api/util/string_processing/configuration.py +682 -0
nv_ingest_api/util/string_processing/yaml.py +109 -0
nv_ingest_api/util/system/__init__.py +0 -0
nv_ingest_api/util/system/hardware_info.py +594 -0
nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
udfs/__init__.py +5 -0
udfs/llm_summarizer_udf.py +259 -0

nv_ingest_api/internal/schemas/extract/extract_audio_schema.py ADDED Viewed

@@ -0,0 +1,133 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Optional
+from typing import Tuple
+from pydantic import BaseModel, Field
+from pydantic import root_validator
+from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
+logger = logging.getLogger(__name__)
+class AudioConfigSchema(LowercaseProtocolMixin):
+    """
+    Configuration schema for audio extraction endpoints and options.
+    Parameters
+    ----------
+    auth_token : Optional[str], default=None
+        Authentication token required for secure services.
+    audio_endpoints : Tuple[str, str]
+        A tuple containing the gRPC and HTTP services for the audio_retriever endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    Methods
+    -------
+    validate_endpoints(values)
+        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.
+    Raises
+    ------
+    ValueError
+        If both gRPC and HTTP services are empty for any endpoint.
+    Config
+    ------
+    extra : str
+        Pydantic config option to forbid extra fields.
+    """
+    auth_token: Optional[str] = Field(default=None, repr=False)
+    audio_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    audio_infer_protocol: Optional[str] = None
+    function_id: Optional[str] = None
+    use_ssl: Optional[bool] = None
+    ssl_cert: Optional[str] = Field(default=None, repr=False)
+    segment_audio: Optional[bool] = None
+    @root_validator(pre=True)
+    def validate_endpoints(cls, values):
+        """
+        Validates the gRPC and HTTP services for all endpoints.
+        Parameters
+        ----------
+        values : dict
+            Dictionary containing the values of the attributes for the class.
+        Returns
+        -------
+        dict
+            The validated dictionary of values.
+        Raises
+        ------
+        ValueError
+            If both gRPC and HTTP services are empty for any endpoint.
+        """
+        def clean_service(service):
+            """Set service to None if it's an empty string or contains only spaces or quotes."""
+            if service is None or not service.strip() or service.strip(" \"'") == "":
+                return None
+            return service
+        endpoint_name = "audio_endpoints"
+        grpc_service, http_service = values.get(endpoint_name)
+        grpc_service = clean_service(grpc_service)
+        http_service = clean_service(http_service)
+        if not grpc_service and not http_service:
+            raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
+        values[endpoint_name] = (grpc_service, http_service)
+        # Auto-infer protocol from endpoints if not specified
+        protocol_name = "audio_infer_protocol"
+        protocol_value = values.get(protocol_name)
+        if not protocol_value:
+            protocol_value = "http" if http_service else "grpc" if grpc_service else ""
+        values[protocol_name] = protocol_value
+        return values
+    class Config:
+        extra = "forbid"
+class AudioExtractorSchema(BaseModel):
+    """
+    Configuration schema for the audio extractor settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=16
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception on processing failure.
+    audio_extraction_config : Optional[AudioConfigSchema], default=None
+        Configuration schema for the audio extraction stage.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False
+    audio_extraction_config: Optional[AudioConfigSchema] = None
+    # Reject unknown keys instead of silently ignoring them.
+    class Config:
+        extra = "forbid"

nv_ingest_api/internal/schemas/extract/extract_chart_schema.py ADDED Viewed

@@ -0,0 +1,144 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Optional
+from typing import Tuple
+from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
+from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
+logger = logging.getLogger(__name__)
+class ChartExtractorConfigSchema(LowercaseProtocolMixin):
+    """
+    Configuration schema for chart extraction service endpoints and options.
+    Parameters
+    ----------
+    auth_token : Optional[str], default=None
+        Authentication token required for secure services.
+    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+        A tuple containing the gRPC and HTTP services for the yolox endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+        A tuple containing the gRPC and HTTP services for the ocr endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    Methods
+    -------
+    validate_endpoints(values)
+        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.
+    Raises
+    ------
+    ValueError
+        If both gRPC and HTTP services are empty for any endpoint.
+    Config
+    ------
+    extra : str
+        Pydantic config option to forbid extra fields.
+    """
+    auth_token: Optional[str] = Field(default=None, repr=False)
+    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    yolox_infer_protocol: str = ""
+    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    ocr_infer_protocol: str = ""
+    nim_batch_size: int = 2
+    workers_per_progress_engine: int = 5
+    @model_validator(mode="before")
+    @classmethod
+    def validate_endpoints(cls, values):
+        """
+        Validates the gRPC and HTTP services for all endpoints.
+        Ensures that at least one service (either gRPC or HTTP) is provided
+        for each endpoint in the configuration.
+        Parameters
+        ----------
+        values : dict
+            Dictionary containing the values of the attributes for the class.
+        Returns
+        -------
+        dict
+            The validated dictionary of values.
+        Raises
+        ------
+        ValueError
+            If both gRPC and HTTP services are empty for any endpoint.
+        """
+        def clean_service(service):
+            """Set service to None if it's an empty string or contains only spaces or quotes."""
+            if service is None or not service.strip() or service.strip(" \"'") == "":
+                return None
+            return service
+        for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
+            grpc_service, http_service = values.get(endpoint_name, (None, None))
+            grpc_service = clean_service(grpc_service)
+            http_service = clean_service(http_service)
+            if not grpc_service and not http_service:
+                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
+            values[endpoint_name] = (grpc_service, http_service)
+            # Auto-infer protocol from endpoints if not specified
+            protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
+            protocol_value = values.get(protocol_name)
+            if not protocol_value:
+                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
+            values[protocol_name] = protocol_value
+        return values
+    model_config = ConfigDict(extra="forbid")
+class ChartExtractorSchema(BaseModel):
+    """
+    Configuration schema for chart extraction processing settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=2
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception if a failure occurs during chart extraction.
+    extraction_config: Optional[ChartExtractorConfigSchema], default=None
+        Configuration for the chart extraction stage, including yolox and ocr service endpoints.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 2
+    raise_on_failure: bool = False
+    endpoint_config: Optional[ChartExtractorConfigSchema] = None
+    @field_validator("max_queue_size", "n_workers")
+    def check_positive(cls, v, field):
+        if v <= 0:
+            raise ValueError(f"{field.field_name} must be greater than 0.")
+        return v
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/extract/extract_docx_schema.py ADDED Viewed

@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Optional
+from typing import Tuple
+from pydantic import model_validator, ConfigDict, BaseModel, Field
+from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
+from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
+logger = logging.getLogger(__name__)
+class DocxConfigSchema(LowercaseProtocolMixin):
+    """
+    Configuration schema for docx extraction endpoints and options.
+    Parameters
+    ----------
+    auth_token : Optional[str], default=None
+        Authentication token required for secure services.
+    yolox_endpoints : Tuple[str, str]
+        A tuple containing the gRPC and HTTP services for the yolox endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    Methods
+    -------
+    validate_endpoints(values)
+        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.
+    Raises
+    ------
+    ValueError
+        If both gRPC and HTTP services are empty for any endpoint.
+    Config
+    ------
+    extra : str
+        Pydantic config option to forbid extra fields.
+    """
+    auth_token: Optional[str] = Field(default=None, repr=False)
+    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    yolox_infer_protocol: str = ""
+    @model_validator(mode="before")
+    @classmethod
+    def validate_endpoints(cls, values):
+        """
+        Validates the gRPC and HTTP services for all endpoints.
+        Parameters
+        ----------
+        values : dict
+            Dictionary containing the values of the attributes for the class.
+        Returns
+        -------
+        dict
+            The validated dictionary of values.
+        Raises
+        ------
+        ValueError
+            If both gRPC and HTTP services are empty for any endpoint.
+        """
+        def clean_service(service):
+            """Set service to None if it's an empty string or contains only spaces or quotes."""
+            if service is None or not service.strip() or service.strip(" \"'") == "":
+                return None
+            return service
+        for model_name in ["yolox"]:
+            endpoint_name = f"{model_name}_endpoints"
+            grpc_service, http_service = values.get(endpoint_name)
+            grpc_service = clean_service(grpc_service)
+            http_service = clean_service(http_service)
+            if not grpc_service and not http_service:
+                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
+            values[endpoint_name] = (grpc_service, http_service)
+            # Auto-infer protocol from endpoints if not specified
+            protocol_name = f"{model_name}_infer_protocol"
+            protocol_value = values.get(protocol_name)
+            if not protocol_value:
+                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
+            values[protocol_name] = protocol_value
+        return values
+    model_config = ConfigDict(extra="forbid")
+class DocxExtractorSchema(BaseModel):
+    """
+    Configuration schema for the DOCX extractor settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=16
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception on processing failure.
+    docx_extraction_config : Optional[DocxConfigSchema], default=None
+        Configuration schema for the docx extraction stage.
+    pdfium_config : Optional[PDFiumConfigSchema], default=None
+        Optional PDFium configuration (schema imported from the PDF extract module);
+        how the docx stage consumes it is not visible here.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False
+    docx_extraction_config: Optional[DocxConfigSchema] = None
+    pdfium_config: Optional[PDFiumConfigSchema] = None
+    # Reject unknown keys instead of silently ignoring them.
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/extract/extract_html_schema.py ADDED Viewed

@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from pydantic import ConfigDict, BaseModel
+logger = logging.getLogger(__name__)
+class HtmlExtractorSchema(BaseModel):
+    """
+    Configuration schema for the HTML extractor settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=16
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception on processing failure.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False
+    # Reject unknown keys instead of silently ignoring them.
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/extract/extract_image_schema.py ADDED Viewed

@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Optional
+from typing import Tuple
+from pydantic import model_validator, ConfigDict, BaseModel, Field
+from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
+logger = logging.getLogger(__name__)
+class ImageConfigSchema(LowercaseProtocolMixin):
+    """
+    Configuration schema for image extraction endpoints and options.
+    Parameters
+    ----------
+    auth_token : Optional[str], default=None
+        Authentication token required for secure services.
+    yolox_endpoints : Tuple[str, str]
+        A tuple containing the gRPC and HTTP services for the yolox endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    Methods
+    -------
+    validate_endpoints(values)
+        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.
+    Raises
+    ------
+    ValueError
+        If both gRPC and HTTP services are empty for any endpoint.
+    Config
+    ------
+    extra : str
+        Pydantic config option to forbid extra fields.
+    """
+    auth_token: Optional[str] = Field(default=None, repr=False)
+    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    yolox_infer_protocol: str = ""
+    @model_validator(mode="before")
+    @classmethod
+    def validate_endpoints(cls, values):
+        """
+        Validates the gRPC and HTTP services for all endpoints.
+        Parameters
+        ----------
+        values : dict
+            Dictionary containing the values of the attributes for the class.
+        Returns
+        -------
+        dict
+            The validated dictionary of values.
+        Raises
+        ------
+        ValueError
+            If both gRPC and HTTP services are empty for any endpoint.
+        """
+        def clean_service(service):
+            """Set service to None if it's an empty string or contains only spaces or quotes."""
+            if service is None or not service.strip() or service.strip(" \"'") == "":
+                return None
+            return service
+        for model_name in ["yolox"]:
+            endpoint_name = f"{model_name}_endpoints"
+            grpc_service, http_service = values.get(endpoint_name)
+            grpc_service = clean_service(grpc_service)
+            http_service = clean_service(http_service)
+            if not grpc_service and not http_service:
+                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
+            values[endpoint_name] = (grpc_service, http_service)
+            # Auto-infer protocol from endpoints if not specified
+            protocol_name = f"{model_name}_infer_protocol"
+            protocol_value = values.get(protocol_name)
+            if not protocol_value:
+                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
+            values[protocol_name] = protocol_value
+        return values
+    model_config = ConfigDict(extra="forbid")
+class ImageExtractorSchema(BaseModel):
+    """
+    Configuration schema for the image extractor settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=16
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception on processing failure.
+    image_extraction_config : Optional[ImageConfigSchema], default=None
+        Configuration schema for the image extraction stage.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 16
+    raise_on_failure: bool = False
+    image_extraction_config: Optional[ImageConfigSchema] = None
+    # Reject unknown keys instead of silently ignoring them.
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py ADDED Viewed

@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Optional
+from typing import Tuple
+from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
+from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
+logger = logging.getLogger(__name__)
+class InfographicExtractorConfigSchema(LowercaseProtocolMixin):
+    """
+    Configuration schema for infographic extraction service endpoints and options.
+    Parameters
+    ----------
+    auth_token : Optional[str], default=None
+        Authentication token required for secure services.
+    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+        A tuple containing the gRPC and HTTP services for the ocr endpoint.
+        Either the gRPC or HTTP service can be empty, but not both.
+    Methods
+    -------
+    validate_endpoints(values)
+        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.
+    Raises
+    ------
+    ValueError
+        If both gRPC and HTTP services are empty for any endpoint.
+    Config
+    ------
+    extra : str
+        Pydantic config option to forbid extra fields.
+    """
+    auth_token: Optional[str] = Field(default=None, repr=False)
+    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    ocr_infer_protocol: str = ""
+    nim_batch_size: int = 2
+    workers_per_progress_engine: int = 5
+    @model_validator(mode="before")
+    @classmethod
+    def validate_endpoints(cls, values):
+        """
+        Validates the gRPC and HTTP services for all endpoints.
+        Ensures that at least one service (either gRPC or HTTP) is provided
+        for each endpoint in the configuration.
+        Parameters
+        ----------
+        values : dict
+            Dictionary containing the values of the attributes for the class.
+        Returns
+        -------
+        dict
+            The validated dictionary of values.
+        Raises
+        ------
+        ValueError
+            If both gRPC and HTTP services are empty for any endpoint.
+        """
+        def clean_service(service):
+            """Set service to None if it's an empty string or contains only spaces or quotes."""
+            if service is None or not service.strip() or service.strip(" \"'") == "":
+                return None
+            return service
+        for endpoint_name in ["ocr_endpoints"]:
+            grpc_service, http_service = values.get(endpoint_name, (None, None))
+            grpc_service = clean_service(grpc_service)
+            http_service = clean_service(http_service)
+            if not grpc_service and not http_service:
+                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
+            values[endpoint_name] = (grpc_service, http_service)
+            # Auto-infer protocol from endpoints if not specified
+            protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
+            protocol_value = values.get(protocol_name)
+            if not protocol_value:
+                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
+            values[protocol_name] = protocol_value
+        return values
+    model_config = ConfigDict(extra="forbid")
+class InfographicExtractorSchema(BaseModel):
+    """
+    Configuration schema for infographic extraction processing settings.
+    Parameters
+    ----------
+    max_queue_size : int, default=1
+        The maximum number of items allowed in the processing queue.
+    n_workers : int, default=2
+        The number of worker threads to use for processing.
+    raise_on_failure : bool, default=False
+        A flag indicating whether to raise an exception if a failure occurs during infographic extraction.
+    stage_config : Optional[InfographicExtractorConfigSchema], default=None
+        Configuration for the infographic extraction stage, including yolox and ocr service endpoints.
+    """
+    max_queue_size: int = 1
+    n_workers: int = 2
+    raise_on_failure: bool = False
+    endpoint_config: Optional[InfographicExtractorConfigSchema] = None
+    @field_validator("max_queue_size", "n_workers")
+    def check_positive(cls, v, field):
+        if v <= 0:
+            raise ValueError(f"{field.field_name} must be greater than 0.")
+        return v
+    model_config = ConfigDict(extra="forbid")