PyPI - nv-ingest-client - Versions diffs - 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl - Mend

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38) hide show

nv_ingest_client/cli/util/click.py +182 -30
nv_ingest_client/cli/util/processing.py +0 -393
nv_ingest_client/client/client.py +561 -207
nv_ingest_client/client/ingest_job_handler.py +412 -0
nv_ingest_client/client/interface.py +466 -59
nv_ingest_client/client/util/processing.py +11 -1
nv_ingest_client/nv_ingest_cli.py +58 -6
nv_ingest_client/primitives/jobs/job_spec.py +32 -10
nv_ingest_client/primitives/tasks/__init__.py +6 -4
nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
nv_ingest_client/primitives/tasks/caption.py +10 -16
nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
nv_ingest_client/primitives/tasks/dedup.py +12 -21
nv_ingest_client/primitives/tasks/embed.py +37 -76
nv_ingest_client/primitives/tasks/extract.py +68 -169
nv_ingest_client/primitives/tasks/filter.py +22 -28
nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
nv_ingest_client/primitives/tasks/split.py +17 -18
nv_ingest_client/primitives/tasks/store.py +29 -29
nv_ingest_client/primitives/tasks/task_base.py +1 -72
nv_ingest_client/primitives/tasks/task_factory.py +10 -11
nv_ingest_client/primitives/tasks/udf.py +349 -0
nv_ingest_client/util/dataset.py +8 -2
nv_ingest_client/util/document_analysis.py +314 -0
nv_ingest_client/util/image_disk_utils.py +300 -0
nv_ingest_client/util/transport.py +12 -6
nv_ingest_client/util/util.py +66 -0
nv_ingest_client/util/vdb/milvus.py +220 -75
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
nv_ingest_client/cli/util/tasks.py +0 -3
nv_ingest_client/primitives/exceptions.py +0 -0
nv_ingest_client/primitives/tasks/transform.py +0 -0
nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
{nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0

nv_ingest_client/cli/util/click.py CHANGED Viewed

@@ -12,23 +12,30 @@ from pprint import pprint
 from typing import Union, List, Any, Dict
 import click
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.primitives.tasks import CaptionTask
 from nv_ingest_client.primitives.tasks import DedupTask
 from nv_ingest_client.primitives.tasks import EmbedTask
 from nv_ingest_client.primitives.tasks import ExtractTask
 from nv_ingest_client.primitives.tasks import FilterTask
+from nv_ingest_client.primitives.tasks import InfographicExtractionTask
 from nv_ingest_client.primitives.tasks import SplitTask
 from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import StoreTask
-from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
-from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
-from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
-from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
-from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
-from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
-from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
+from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
 from nv_ingest_client.util.util import generate_matching_files
 logger = logging.getLogger(__name__)
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
     KAFKA = "KAFKA"
-# Example TaskId validation set
-VALID_TASK_IDS = {"task1", "task2", "task3"}
-_MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
 def debug_print_click_options(ctx: click.Context) -> None:
     """
     Retrieves all options from the Click context and pretty prints them.
@@ -149,9 +150,11 @@ TaskType = Union[
     EmbedTask,
     ExtractTask,
     FilterTask,
+    InfographicExtractionTask,
     SplitTask,
     StoreEmbedTask,
     StoreTask,
+    UDFTask,
 ]
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
         the error details (e.g., expected property format), and show the input that was provided.
     """
     try:
-        return json.loads(options_str)
+        options = json.loads(options_str)
+        # Convert string boolean values to actual booleans for extract tasks
+        if task_id == "extract":
+            boolean_fields = [
+                "extract_text",
+                "extract_images",
+                "extract_tables",
+                "extract_charts",
+                "extract_infographics",
+                "extract_page_as_image",
+            ]
+            for field in boolean_fields:
+                if field in options:
+                    value = options[field]
+                    if isinstance(value, str):
+                        if value.lower() in ("true", "1", "yes", "on"):
+                            options[field] = True
+                        elif value.lower() in ("false", "0", "no", "off"):
+                            options[field] = False
+                        else:
+                            raise ValueError(
+                                f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
+                            )
+        return options
     except json.JSONDecodeError as e:
         error_message = (
             f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
             options: Dict[str, Any] = parse_task_options(task_id, json_options)
             if task_id == "split":
-                task_options = check_schema(SplitTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
             elif task_id == "extract":
-                task_options = check_schema(ExtractTaskSchema, options, task_id, json_options)
-                new_task_id = f"{task_id}_{task_options.document_type}"
-                new_task = [(new_task_id, ExtractTask(**task_options.model_dump()))]
+                # Map CLI parameters to API schema structure
+                method = options.pop("extract_method", None)
+                if method is None:
+                    method = "pdfium"  # Default fallback
+                # Build params dict for API schema
+                params = {k: v for k, v in options.items() if k != "document_type"}
+                # Validate with API schema
+                api_options = {
+                    "document_type": options.get("document_type"),
+                    "method": method,
+                    "params": params,
+                }
+                task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
+                new_task_id = f"{task_id}_{task_options.document_type.value}"
+                # Create ExtractTask with original CLI parameters
+                extract_task_params = {
+                    "document_type": task_options.document_type,
+                    "extract_method": task_options.method,
+                    **task_options.params,
+                }
+                # Start with the main extract task
+                new_task = [(new_task_id, ExtractTask(**extract_task_params))]
+                # Add ChartExtractionTask if extract_charts is True
+                if task_options.params.get("extract_charts", False):
+                    from nv_ingest_client.primitives.tasks import ChartExtractionTask
+                    chart_task_id = "chart_data_extract"
+                    chart_params = {"params": {}}  # ChartExtractionTask takes params dict
+                    new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
+                # Add TableExtractionTask if extract_tables is True
+                if task_options.params.get("extract_tables", False):
+                    from nv_ingest_client.primitives.tasks import TableExtractionTask
+                    table_task_id = "table_data_extract"
+                    new_task.append((table_task_id, TableExtractionTask()))
             elif task_id == "store":
-                task_options = check_schema(StoreTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
             elif task_id == "store_embedding":
-                task_options = check_schema(StoreEmbedTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
             elif task_id == "caption":
-                task_options = check_schema(CaptionTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, CaptionTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for CaptionTask constructor
+                caption_params = {
+                    "api_key": task_options.api_key,
+                    "endpoint_url": task_options.endpoint_url,
+                    "prompt": task_options.prompt,
+                    "model_name": task_options.model_name,
+                }
+                new_task = [(new_task_id, CaptionTask(**caption_params))]
             elif task_id == "dedup":
-                task_options = check_schema(DedupTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, DedupTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for DedupTask constructor
+                dedup_params = {
+                    "content_type": task_options.content_type,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, DedupTask(**dedup_params))]
             elif task_id == "filter":
-                task_options = check_schema(FilterTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
-                new_task = [(new_task_id, FilterTask(**task_options.model_dump()))]
+                # Extract individual parameters from API schema for FilterTask constructor
+                filter_params = {
+                    "content_type": task_options.content_type,
+                    "min_size": task_options.params.min_size,
+                    "max_aspect_ratio": task_options.params.max_aspect_ratio,
+                    "min_aspect_ratio": task_options.params.min_aspect_ratio,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, FilterTask(**filter_params))]
             elif task_id == "embed":
-                task_options = check_schema(EmbedTaskSchema, options, task_id, json_options)
+                task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
                 new_task_id = f"{task_id}"
                 new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
+            elif task_id == "infographic":
+                task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
+            elif task_id == "udf":
+                # Validate mutual exclusivity of target_stage and phase
+                has_target_stage = "target_stage" in options and options["target_stage"] is not None
+                has_phase = "phase" in options and options["phase"] is not None
+                if has_target_stage and has_phase:
+                    raise ValueError(
+                        "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
+                    )
+                elif not has_target_stage and not has_phase:
+                    raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
+                # Pre-process UDF task options to convert phase names to integers
+                if "phase" in options and isinstance(options["phase"], str):
+                    # Convert phase string to integer using the same logic as UDFTask
+                    phase_str = options["phase"].upper()
+                    phase_aliases = {
+                        "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PRE": PipelinePhase.PRE_PROCESSING,
+                        "EXTRACTION": PipelinePhase.EXTRACTION,
+                        "EXTRACT": PipelinePhase.EXTRACTION,
+                        "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POST": PipelinePhase.POST_PROCESSING,
+                        "MUTATION": PipelinePhase.MUTATION,
+                        "MUTATE": PipelinePhase.MUTATION,
+                        "TRANSFORM": PipelinePhase.TRANSFORM,
+                        "RESPONSE": PipelinePhase.RESPONSE,
+                        "RESP": PipelinePhase.RESPONSE,
+                    }
+                    if phase_str in phase_aliases:
+                        options["phase"] = phase_aliases[phase_str].value
+                    else:
+                        raise ValueError(f"Invalid phase name: {options['phase']}")
+                # Try to infer udf_function_name if not provided
+                if "udf_function_name" not in options or not options["udf_function_name"]:
+                    udf_function = options.get("udf_function", "")
+                    if udf_function:
+                        inferred_name = infer_udf_function_name(udf_function)
+                        if inferred_name:
+                            options["udf_function_name"] = inferred_name
+                            logger.info(f"Inferred UDF function name: {inferred_name}")
+                        else:
+                            raise ValueError(
+                                f"Could not infer UDF function name from '{udf_function}'. "
+                                "Please specify 'udf_function_name' explicitly."
+                            )
+                task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
             else:
                 raise ValueError(f"Unsupported task type: {task_id}")
+            # Check for duplicate tasks - now allowing multiple tasks of the same type
             if new_task_id in validated_tasks:
-                raise ValueError(f"Duplicate task detected: {new_task_id}")
+                logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
             logger.debug("Adding task: %s", new_task_id)
             for task_tuple in new_task:
-                validated_tasks[task_tuple[0]] = task_tuple[1]
+                if task_tuple[0] in validated_tasks:
+                    # Convert single task to list if needed, then append
+                    existing_task = validated_tasks[task_tuple[0]]
+                    if not isinstance(existing_task, list):
+                        validated_tasks[task_tuple[0]] = [existing_task]
+                    validated_tasks[task_tuple[0]].append(task_tuple[1])
+                else:
+                    validated_tasks[task_tuple[0]] = task_tuple[1]
         except ValueError as e:
             validation_errors.append(str(e))

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

Potentially problematic release.

nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl