PyPI - nv-ingest-client - Versions diffs - 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl - Mend

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38)

nv_ingest_client/cli/util/click.py +182 -30
nv_ingest_client/cli/util/processing.py +0 -393
nv_ingest_client/client/client.py +561 -207
nv_ingest_client/client/ingest_job_handler.py +412 -0
nv_ingest_client/client/interface.py +466 -59
nv_ingest_client/client/util/processing.py +11 -1
nv_ingest_client/nv_ingest_cli.py +58 -6
nv_ingest_client/primitives/jobs/job_spec.py +32 -10
nv_ingest_client/primitives/tasks/__init__.py +6 -4
nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
nv_ingest_client/primitives/tasks/caption.py +10 -16
nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
nv_ingest_client/primitives/tasks/dedup.py +12 -21
nv_ingest_client/primitives/tasks/embed.py +37 -76
nv_ingest_client/primitives/tasks/extract.py +68 -169
nv_ingest_client/primitives/tasks/filter.py +22 -28
nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
nv_ingest_client/primitives/tasks/split.py +17 -18
nv_ingest_client/primitives/tasks/store.py +29 -29
nv_ingest_client/primitives/tasks/task_base.py +1 -72
nv_ingest_client/primitives/tasks/task_factory.py +10 -11
nv_ingest_client/primitives/tasks/udf.py +349 -0
nv_ingest_client/util/dataset.py +8 -2
nv_ingest_client/util/document_analysis.py +314 -0
nv_ingest_client/util/image_disk_utils.py +300 -0
nv_ingest_client/util/transport.py +12 -6
nv_ingest_client/util/util.py +66 -0
nv_ingest_client/util/vdb/milvus.py +220 -75
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
nv_ingest_client/cli/util/tasks.py +0 -3
nv_ingest_client/primitives/exceptions.py +0 -0
nv_ingest_client/primitives/tasks/transform.py +0 -0
nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0

nv_ingest_client/primitives/tasks/embed.py CHANGED Viewed

@@ -7,82 +7,15 @@
 import logging
 from typing import Any
 from typing import Dict
-from typing import Literal
 from typing import Optional
-from typing import Type
-from pydantic import BaseModel
-from pydantic import ConfigDict
-from pydantic import model_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
 from .task_base import Task
 logger = logging.getLogger(__name__)
-class EmbedTaskSchema(BaseModel):
-    """
-    Schema for embed task configuration.
-    This schema contains configuration details for an embedding task,
-    including the endpoint URL, model name, API key, and error filtering flag.
-    Attributes
-    ----------
-    endpoint_url : Optional[str]
-        URL of the embedding endpoint. Default is None.
-    model_name : Optional[str]
-        Name of the embedding model. Default is None.
-    api_key : Optional[str]
-        API key for authentication with the embedding service. Default is None.
-    filter_errors : bool
-        Flag to indicate whether errors should be filtered. Default is False.
-    """
-    endpoint_url: Optional[str] = None
-    model_name: Optional[str] = None
-    api_key: Optional[str] = None
-    filter_errors: bool = False
-    text_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    image_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    structured_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    audio_elements_modality: Optional[Literal["text"]] = None
-    @model_validator(mode="before")
-    def handle_deprecated_fields(cls: Type["EmbedTaskSchema"], values: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Handle deprecated fields before model validation.
-        This validator checks for the presence of deprecated keys ('text' and 'tables')
-        in the input dictionary and removes them. Warnings are issued if these keys are found.
-        Parameters
-        ----------
-        values : Dict[str, Any]
-            Input dictionary of model values.
-        Returns
-        -------
-        Dict[str, Any]
-            The updated dictionary with deprecated fields removed.
-        """
-        if "text" in values:
-            logger.warning(
-                "'text' parameter is deprecated and will be ignored. Future versions will remove this argument."
-            )
-            values.pop("text")
-        if "tables" in values:
-            logger.warning(
-                "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
-            )
-            values.pop("tables")
-        return values
-    model_config = ConfigDict(extra="forbid")
-    model_config["protected_namespaces"] = ()
 class EmbedTask(Task):
     """
     Object for document embedding tasks.
@@ -103,6 +36,8 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -133,14 +68,30 @@ class EmbedTask(Task):
                 "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
             )
-        self._endpoint_url: Optional[str] = endpoint_url
-        self._model_name: Optional[str] = model_name
-        self._api_key: Optional[str] = api_key
-        self._filter_errors: bool = filter_errors
-        self._text_elements_modality: Optional[bool] = text_elements_modality
-        self._image_elements_modality: Optional[bool] = image_elements_modality
-        self._structured_elements_modality: Optional[bool] = structured_elements_modality
-        self._audio_elements_modality: Optional[bool] = audio_elements_modality
+        # Use the API schema for validation
+        validated_data = IngestTaskEmbedSchema(
+            endpoint_url=endpoint_url,
+            model_name=model_name,
+            api_key=api_key,
+            filter_errors=filter_errors,
+            text_elements_modality=text_elements_modality,
+            image_elements_modality=image_elements_modality,
+            structured_elements_modality=structured_elements_modality,
+            audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
+        )
+        self._endpoint_url = validated_data.endpoint_url
+        self._model_name = validated_data.model_name
+        self._api_key = validated_data.api_key
+        self._filter_errors = validated_data.filter_errors
+        self._text_elements_modality = validated_data.text_elements_modality
+        self._image_elements_modality = validated_data.image_elements_modality
+        self._structured_elements_modality = validated_data.structured_elements_modality
+        self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
     def __str__(self) -> str:
         """
@@ -169,6 +120,10 @@ class EmbedTask(Task):
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
         return info
     def to_dict(self) -> Dict[str, Any]:
@@ -204,4 +159,10 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality
+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+        if self._result_target_field:
+            task_properties["result_target_field"] = self.result_target_field
         return {"type": "embed", "task_properties": task_properties}

nv_ingest_client/primitives/tasks/extract.py CHANGED Viewed

@@ -12,12 +12,8 @@ from typing import Any
 from typing import Dict
 from typing import Literal
 from typing import Optional
-from typing import get_args
-from pydantic import BaseModel
-from pydantic import ConfigDict
-from pydantic import field_validator
-from pydantic import model_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
 from .task_base import Task
@@ -64,142 +60,9 @@ _Type_Extract_Method_PDF = Literal[
     "unstructured_io",
 ]
-_Type_Extract_Method_DOCX = Literal["python_docx", "haystack", "unstructured_local", "unstructured_service"]
+_Type_Extract_Images_Method = Literal["group", "yolox"]
-_Type_Extract_Method_PPTX = Literal["python_pptx", "haystack", "unstructured_local", "unstructured_service"]
-_Type_Extract_Method_Image = Literal["image"]
-_Type_Extract_Method_Audio = Literal["audio"]
-_Type_Extract_Method_Text = Literal["txt"]
-_Type_Extract_Method_Html = Literal["markitdown"]
-_Type_Extract_Method_Map = {
-    "bmp": get_args(_Type_Extract_Method_Image),
-    "docx": get_args(_Type_Extract_Method_DOCX),
-    "html": get_args(_Type_Extract_Method_Html),
-    "jpeg": get_args(_Type_Extract_Method_Image),
-    "jpg": get_args(_Type_Extract_Method_Image),
-    "pdf": get_args(_Type_Extract_Method_PDF),
-    "png": get_args(_Type_Extract_Method_Image),
-    "pptx": get_args(_Type_Extract_Method_PPTX),
-    "text": get_args(_Type_Extract_Method_Text),
-    "tiff": get_args(_Type_Extract_Method_Image),
-    "txt": get_args(_Type_Extract_Method_Text),
-    "mp3": get_args(_Type_Extract_Method_Audio),
-    "wav": get_args(_Type_Extract_Method_Audio),
-}
-_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium", "nemoretriever_parse"]
-_Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
-_Type_Extract_Tables_Method_PPTX = Literal["python_pptx",]
-_Type_Extract_Tables_Method_Map = {
-    "pdf": get_args(_Type_Extract_Tables_Method_PDF),
-    "docx": get_args(_Type_Extract_Tables_Method_DOCX),
-    "pptx": get_args(_Type_Extract_Tables_Method_PPTX),
-}
-_Type_Extract_Images_Method = Literal["simple", "group"]
-class ExtractTaskSchema(BaseModel):
-    document_type: str
-    extract_method: str = None  # Initially allow None to set a smart default
-    extract_text: bool = True
-    extract_images: bool = True
-    extract_images_method: str = "group"
-    extract_images_params: Optional[Dict[str, Any]] = None
-    extract_tables: bool = True
-    extract_tables_method: str = "yolox"
-    extract_charts: Optional[bool] = None  # Initially allow None to set a smart default
-    extract_infographics: bool = False
-    extract_page_as_image: bool = False
-    extract_audio_params: Optional[Dict[str, Any]] = None
-    text_depth: str = "document"
-    paddle_output_format: Optional[str] = None
-    table_output_format: str = "pseudo_markdown"
-    @model_validator(mode="after")
-    @classmethod
-    def set_default_extract_method(cls, values):
-        document_type = values.document_type.lower()  # Ensure case-insensitive comparison
-        extract_method = values.extract_method
-        paddle_output_format = values.paddle_output_format
-        if document_type not in _DEFAULT_EXTRACTOR_MAP:
-            raise ValueError(
-                f"Unsupported document type: {document_type}."
-                f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
-            )
-        if extract_method is None:
-            values.extract_method = _DEFAULT_EXTRACTOR_MAP[document_type]
-        if paddle_output_format is not None:
-            logger.warning(
-                "`paddle_output_format` is deprecated and will be removed in a future release. "
-                "Please use `table_output_format` instead."
-            )
-            values.table_output_format = paddle_output_format
-        return values
-    @field_validator("extract_charts")
-    def set_default_extract_charts(cls, v, values):
-        # `extract_charts` is initially set to None for backward compatibility.
-        # {extract_tables: true, extract_charts: None} or {extract_tables: true, extract_charts: true} enables both
-        # table and chart extraction.
-        # {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
-        extract_charts = v
-        if extract_charts is None:
-            extract_charts = values.data.get("extract_tables")
-        return extract_charts
-    @field_validator("extract_method")
-    def extract_method_must_be_valid(cls, v, values, **kwargs):
-        document_type = values.data.get("document_type", "").lower()  # Ensure case-insensitive comparison
-        # Skip validation for text-like types, since they do not have 'extract' stages.
-        if document_type in ["txt", "text", "json", "md", "sh"]:
-            return
-        valid_methods = set(_Type_Extract_Method_Map[document_type])
-        if v not in valid_methods:
-            raise ValueError(f"extract_method must be one of {valid_methods}")
-        return v
-    @field_validator("document_type")
-    def document_type_must_be_supported(cls, v):
-        if v.lower() not in _DEFAULT_EXTRACTOR_MAP:
-            raise ValueError(
-                f"Unsupported document type '{v}'. Supported types are: {', '.join(_DEFAULT_EXTRACTOR_MAP.keys())}"
-            )
-        return v.lower()
-    @field_validator("extract_tables_method")
-    def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
-        document_type = values.data.get("document_type", "").lower()  # Ensure case-insensitive comparison
-        valid_methods = set(_Type_Extract_Tables_Method_Map[document_type])
-        if v not in valid_methods:
-            raise ValueError(f"extract_method must be one of {valid_methods}")
-        return v
-    @field_validator("extract_images_method")
-    def extract_images_method_must_be_valid(cls, v):
-        if v.lower() not in get_args(_Type_Extract_Images_Method):
-            raise ValueError(
-                f"Unsupported document type '{v}'. Supported types are: {', '.join(_Type_Extract_Images_Method)}"
-            )
-        return v.lower()
-    model_config = ConfigDict(extra="forbid")
+_Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
 class ExtractTask(Task):
@@ -210,7 +73,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method: _Type_Extract_Method_PDF = "pdfium",
+        extract_method: _Type_Extract_Method_PDF = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -223,26 +86,69 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "pseudo_markdown",
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
         """
         super().__init__()
-        self._document_type = document_type
+        # Set default extract_method if None
+        if extract_method is None:
+            # Handle both string and enum inputs
+            if hasattr(document_type, "value"):
+                document_type_str = document_type.value
+            else:
+                document_type_str = document_type
+            document_type_lower = document_type_str.lower()
+            if document_type_lower not in _DEFAULT_EXTRACTOR_MAP:
+                raise ValueError(
+                    f"Unsupported document type: {document_type}."
+                    f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
+                )
+            extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
+        # Set default extract_charts if None
+        if extract_charts is None:
+            extract_charts = extract_tables
+        # Build params dict for API schema validation
+        extract_params = {
+            "extract_text": extract_text,
+            "extract_images": extract_images,
+            "extract_images_method": extract_images_method,
+            "extract_tables": extract_tables,
+            "extract_tables_method": extract_tables_method,
+            "extract_charts": extract_charts,
+            "extract_infographics": extract_infographics,
+            "extract_page_as_image": extract_page_as_image,
+            "text_depth": text_depth,
+            "table_output_format": table_output_format,
+        }
+        # Add optional parameters if provided
+        if extract_images_params:
+            extract_params["extract_images_params"] = extract_images_params
+        if extract_audio_params:
+            extract_params["extract_audio_params"] = extract_audio_params
+        # Use the API schema for validation
+        validated_data = IngestTaskExtractSchema(
+            document_type=document_type,
+            method=extract_method,
+            params=extract_params,
+        )
+        # Store validated data
+        self._document_type = validated_data.document_type
+        self._extract_method = validated_data.method
         self._extract_audio_params = extract_audio_params
         self._extract_images = extract_images
-        self._extract_method = extract_method
         self._extract_tables = extract_tables
         self._extract_images_method = extract_images_method
         self._extract_images_params = extract_images_params
         self._extract_tables_method = extract_tables_method
-        # `extract_charts` is initially set to None for backward compatibility.
-        # {extract_tables: true, extract_charts: None} or {extract_tables: true, extract-charts: true} enables both
-        # table and chart extraction.
-        # {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
-        self._extract_charts = extract_charts if extract_charts is not None else extract_tables
+        self._extract_charts = extract_charts
         self._extract_infographics = extract_infographics
         self._extract_page_as_image = extract_page_as_image
         self._extract_text = extract_text
@@ -256,34 +162,27 @@ class ExtractTask(Task):
         """
         info = ""
         info += "Extract Task:\n"
-        info += f"  document type: {self._document_type}\n"
-        info += f"  extract method: {self._extract_method}\n"
-        info += f"  extract text: {self._extract_text}\n"
-        info += f"  extract images: {self._extract_images}\n"
-        info += f"  extract tables: {self._extract_tables}\n"
-        info += f"  extract charts: {self._extract_charts}\n"
-        info += f"  extract infographics: {self._extract_infographics}\n"
-        info += f"  extract page as image: {self._extract_page_as_image}\n"
-        info += f"  extract images method: {self._extract_images_method}\n"
-        info += f"  extract tables method: {self._extract_tables_method}\n"
-        info += f"  text depth: {self._text_depth}\n"
+        info += f"  document_type: {self._document_type.value}\n"
+        info += f"  extract_method: {self._extract_method}\n"
+        info += f"  extract_text: {self._extract_text}\n"
+        info += f"  extract_images: {self._extract_images}\n"
+        info += f"  extract_tables: {self._extract_tables}\n"
+        info += f"  extract_charts: {self._extract_charts}\n"
+        info += f"  extract_infographics: {self._extract_infographics}\n"
+        info += f"  extract_page_as_image: {self._extract_page_as_image}\n"
+        info += f"  text_depth: {self._text_depth}\n"
         info += f"  table_output_format: {self._table_output_format}\n"
-        if self._extract_images_params:
-            info += f"  extract images params: {self._extract_images_params}\n"
-        if self._extract_audio_params:
-            info += f"  extract audio params: {self._extract_audio_params}\n"
         return info
     def to_dict(self) -> Dict:
         """
-        Convert to a dict for submission to redis (fixme)
+        Convert to a dict for submission to redis
         """
         extract_params = {
             "extract_text": self._extract_text,
             "extract_images": self._extract_images,
-            "extract_tables": self._extract_tables,
             "extract_images_method": self._extract_images_method,
+            "extract_tables": self._extract_tables,
             "extract_tables_method": self._extract_tables_method,
             "extract_charts": self._extract_charts,
             "extract_infographics": self._extract_infographics,
@@ -306,7 +205,7 @@ class ExtractTask(Task):
         task_properties = {
             "method": self._extract_method,
-            "document_type": self._document_type,
+            "document_type": self._document_type.value,
             "params": extract_params,
         }
@@ -339,4 +238,4 @@ class ExtractTask(Task):
     @property
     def document_type(self):
-        return self._document_type
+        return self._document_type.value

nv_ingest_client/primitives/tasks/filter.py CHANGED Viewed

@@ -11,31 +11,13 @@ from typing import Dict
 from typing import Literal
 from typing import Union
-from pydantic import BaseModel, field_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
 from .task_base import Task
 logger = logging.getLogger(__name__)
-class FilterTaskSchema(BaseModel):
-    content_type: str = "image"
-    min_size: int = 128
-    max_aspect_ratio: Union[float, int] = 5.0
-    min_aspect_ratio: Union[float, int] = 0.2
-    filter: bool = False
-    @field_validator("content_type")
-    def content_type_must_be_valid(cls, v):
-        valid_criteria = ["image"]
-        if v not in valid_criteria:
-            raise ValueError(f"content_type must be one of {valid_criteria}")
-        return v
-    class Config:
-        extra = "forbid"
 class FilterTask(Task):
     """
     Object for document filter task
@@ -49,17 +31,29 @@ class FilterTask(Task):
         min_size: int = 128,
         max_aspect_ratio: Union[int, float] = 5.0,
         min_aspect_ratio: Union[int, float] = 0.2,
-        filter: bool = False,
+        filter: bool = True,
     ) -> None:
         """
-        Setup Split Task Config
+        Setup Filter Task Config
         """
         super().__init__()
-        self._content_type = content_type
-        self._min_size = min_size
-        self._max_aspect_ratio = max_aspect_ratio
-        self._min_aspect_ratio = min_aspect_ratio
-        self._filter = filter
+        # Use the API schema for validation
+        validated_data = IngestTaskFilterSchema(
+            content_type=content_type,
+            params={
+                "min_size": min_size,
+                "max_aspect_ratio": max_aspect_ratio,
+                "min_aspect_ratio": min_aspect_ratio,
+                "filter": filter,
+            },
+        )
+        self._content_type = validated_data.content_type
+        self._min_size = validated_data.params.min_size
+        self._max_aspect_ratio = validated_data.params.max_aspect_ratio
+        self._min_aspect_ratio = validated_data.params.min_aspect_ratio
+        self._filter = validated_data.params.filter
     def __str__(self) -> str:
         """
@@ -67,7 +61,7 @@ class FilterTask(Task):
         """
         info = ""
         info += "Filter Task:\n"
-        info += f"  content_type: {self._content_type}\n"
+        info += f"  content_type: {self._content_type.value}\n"
         info += f"  min_size: {self._min_size}\n"
         info += f"  max_aspect_ratio: {self._max_aspect_ratio}\n"
         info += f"  min_aspect_ratio: {self._min_aspect_ratio}\n"
@@ -86,7 +80,7 @@ class FilterTask(Task):
         }
         task_properties = {
-            "content_type": self._content_type,
+            "content_type": self._content_type.value,
             "params": filter_params,
         }

nv_ingest_client/primitives/tasks/infographic_extraction.py CHANGED Viewed

@@ -9,44 +9,47 @@
 import logging
 from typing import Dict
-from pydantic import BaseModel
-from .task_base import Task
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
 logger = logging.getLogger(__name__)
-class InfographicExtractionSchema(BaseModel):
-    class Config:
-        extra = "forbid"
 class InfographicExtractionTask(Task):
     """
     Object for infographic extraction task
     """
-    def __init__(self) -> None:
+    def __init__(self, params: dict = None) -> None:
         """
-        Setup Dedup Task Config
+        Setup Infographic Extraction Task Config
         """
         super().__init__()
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+        # Use the API schema for validation
+        validated_data = IngestTaskInfographicExtraction(params=params)
+        self._params = validated_data.params
     def __str__(self) -> str:
         """
         Returns a string with the object's config and run time state
         """
         info = ""
-        info += "infographic extraction task\n"
+        info += "Infographic Extraction Task:\n"
+        info += f"  params: {self._params}\n"
         return info
     def to_dict(self) -> Dict:
         """
         Convert to a dict for submission to redis
         """
         task_properties = {
-            "params": {},
+            "params": self._params,
         }
         return {"type": "infographic_data_extract", "task_properties": task_properties}

nv_ingest_client/primitives/tasks/split.py CHANGED Viewed

@@ -8,25 +8,14 @@
 import logging
 from typing import Dict
-from typing import Optional
-from pydantic import BaseModel
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
 from .task_base import Task
 logger = logging.getLogger(__name__)
-class SplitTaskSchema(BaseModel):
-    tokenizer: Optional[str] = None
-    chunk_size: int = 1024
-    chunk_overlap: int = 150
-    params: dict = {}
-    class Config:
-        extra = "forbid"
 class SplitTask(Task):
     """
     Object for document splitting task
@@ -37,16 +26,26 @@ class SplitTask(Task):
         tokenizer: str = None,
         chunk_size: int = 1024,
         chunk_overlap: int = 150,
-        params: dict = {},
-    ) -> None:
+        params: dict = None,
+    ):
         """
         Setup Split Task Config
         """
         super().__init__()
-        self._tokenizer = tokenizer
-        self._chunk_size = chunk_size
-        self._chunk_overlap = chunk_overlap
-        self._params = params
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+        # Use the API schema for validation
+        validated_data = IngestTaskSplitSchema(
+            tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, params=params
+        )
+        self._tokenizer = validated_data.tokenizer
+        self._chunk_size = validated_data.chunk_size
+        self._chunk_overlap = validated_data.chunk_overlap
+        self._params = validated_data.params
     def __str__(self) -> str:
         """

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

Potentially problematic release.

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl