nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of nv-ingest-client as possibly problematic.
Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -2,25 +2,13 @@
  # All rights reserved.
  # SPDX-License-Identifier: Apache-2.0

- import base64
- import io
- import json
  import logging
- import os
  import re
  import time
  from collections import defaultdict
- from concurrent.futures import as_completed
  from statistics import mean
  from statistics import median
  from typing import Any
- from typing import Dict
- from typing import List
- from typing import Tuple
-
- from nv_ingest_client.util.processing import handle_future_result
- from PIL import Image
- from tqdm import tqdm

  logger = logging.getLogger(__name__)

@@ -135,387 +123,6 @@ def report_statistics(
      report_overall_speed(total_pages_processed, start_time_ns, total_files)


- def process_response(response: Dict[str, Any], stage_elapsed_times: defaultdict) -> None:
-     """
-     Process the response to extract trace data and calculate elapsed time for each stage.
-
-     This function iterates over trace data in the response, identifies entry and exit times for each stage,
-     calculates the elapsed time, and appends the elapsed time to the corresponding stage in the provided
-     `stage_elapsed_times` dictionary.
-
-     Parameters
-     ----------
-     response : Dict[str, Any]
-         The response dictionary containing trace information for processing stages.
-     stage_elapsed_times : defaultdict
-         A defaultdict where keys are stage names (str) and values are lists of elapsed times (int, in nanoseconds).
-
-     Notes
-     -----
-     The function expects trace keys to include "entry" and "exit" substrings. For each entry key, the corresponding
-     exit key is determined by replacing "entry" with "exit". The stage name is assumed to be the third element when
-     splitting the key by "::".
-     """
-     trace_data: Dict[str, Any] = response.get("trace", {})
-     for key, entry_time in trace_data.items():
-         if "entry" in key:
-             exit_key: str = key.replace("entry", "exit")
-             exit_time: Any = trace_data.get(exit_key)
-             if exit_time:
-                 # Assumes the stage name is in the third position when splitting the key
-                 stage_parts = key.split("::")
-                 if len(stage_parts) >= 3:
-                     stage_name: str = stage_parts[2]
-                     elapsed_time: int = exit_time - entry_time
-                     stage_elapsed_times[stage_name].append(elapsed_time)
-
-
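The removed process_response helper is driven entirely by the trace-key convention its docstring describes: an "entry"/"exit" timestamp pair per stage, with the stage name at index 2 of the "::"-separated key. A minimal, self-contained sketch of that parsing; the key names and timestamps below are illustrative, not taken from a real nv-ingest response:

from collections import defaultdict

# Hypothetical trace payload; real key names may differ, but the docstring above
# guarantees "entry"/"exit" substrings and the stage name at index 2 of a
# "::"-separated key. Timestamps are in nanoseconds.
trace = {
    "trace::entry::pdf_extractor": 1_000_000_000,
    "trace::exit::pdf_extractor": 1_750_000_000,
}

stage_elapsed_times = defaultdict(list)
for key, entry_time in trace.items():
    if "entry" in key:
        exit_time = trace.get(key.replace("entry", "exit"))
        if exit_time:
            stage_elapsed_times[key.split("::")[2]].append(exit_time - entry_time)

print(dict(stage_elapsed_times))  # {'pdf_extractor': [750000000]}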
- def organize_documents_by_type(response_data: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
-     """
-     Organize documents by their content type.
-
-     This function takes a list of response documents, extracts the content type from each document's metadata,
-     and organizes the documents into a dictionary, where the keys are content types and the values are lists of
-     documents belonging to each type.
-
-     Parameters
-     ----------
-     response_data : List[Dict[str, Any]]
-         A list of documents, where each document is represented as a dictionary. Each dictionary must contain
-         a 'metadata' field that may be either a JSON string or a dictionary. The metadata is expected to have a
-         "content_metadata" field containing the document's type.
-
-     Returns
-     -------
-     Dict[str, List[Dict[str, Any]]]
-         A dictionary mapping document types (as strings) to lists of documents. Each key represents a document type,
-         and the associated value is a list of documents that belong to that type.
-
-     Notes
-     -----
-     - If the 'metadata' field of a document is a string, it is parsed into a dictionary using `json.loads`.
-     - The function assumes that each document's metadata has a valid "content_metadata" field with a "type" key.
-     - Documents are grouped by the value of the "type" key in their "content_metadata".
-
-     Examples
-     --------
-     >>> response_data = [
-     ...     {"metadata": {"content_metadata": {"type": "report"}}},
-     ...     {"metadata": '{"content_metadata": {"type": "summary"}}'},
-     ...     {"metadata": {"content_metadata": {"type": "report"}}}
-     ... ]
-     >>> organize_documents_by_type(response_data)
-     {'report': [{'metadata': {'content_metadata': {'type': 'report'}}},
-                 {'metadata': {'content_metadata': {'type': 'report'}}}],
-      'summary': [{'metadata': {'content_metadata': {'type': 'summary'}}}]}
-     """
-     doc_map: Dict[str, List[Dict[str, Any]]] = {}
-     for document in response_data:
-         doc_meta: Any = document["metadata"]
-         if isinstance(doc_meta, str):
-             doc_meta = json.loads(doc_meta)
-         doc_content_metadata: Dict[str, Any] = doc_meta["content_metadata"]
-         doc_type: str = doc_content_metadata["type"]
-         if doc_type not in doc_map:
-             doc_map[doc_type] = []
-         doc_map[doc_type].append(document)
-     return doc_map
-
-
- def save_response_data(response: Dict[str, Any], output_directory: str, images_to_disk: bool = False) -> None:
-     """
-     Save the response data into categorized metadata JSON files and optionally save images to disk.
-
-     This function processes the response data, organizes it based on document types, and saves the organized data
-     into a specified output directory as JSON files. If 'images_to_disk' is True and the document type is 'image',
-     it decodes and writes base64 encoded images to disk.
-
-     Parameters
-     ----------
-     response : Dict[str, Any]
-         A dictionary containing the API response data. It must contain a "data" field, which is expected to be a
-         list of document entries. Each document entry should contain metadata, which includes information about
-         the document's source.
-     output_directory : str
-         The path to the directory where the JSON metadata files should be saved. Subdirectories will be created based
-         on the document types, and the metadata files will be stored within these subdirectories.
-     images_to_disk : bool, optional
-         If True, base64 encoded images in the 'metadata.content' field will be decoded and saved to disk.
-         Default is False.
-
-     Returns
-     -------
-     None
-         This function does not return any values. It writes output to the filesystem.
-
-     Notes
-     -----
-     - If 'images_to_disk' is True and 'doc_type' is 'image', images will be decoded and saved to disk with appropriate
-       file types based on 'metadata.image_metadata.image_type'.
-     """
-     if ("data" not in response) or (not response["data"]):
-         logger.debug("Data is not in the response or response.data is empty")
-         return
-
-     response_data = response["data"]
-
-     if not isinstance(response_data, list) or len(response_data) == 0:
-         logger.debug("Response data is not a list or the list is empty.")
-         return
-
-     doc_meta_base = response_data[0]["metadata"]
-     source_meta = doc_meta_base["source_metadata"]
-     doc_name = source_meta["source_id"]
-     clean_doc_name = get_valid_filename(os.path.basename(doc_name))
-     output_name = f"{clean_doc_name}.metadata.json"
-
-     doc_map = organize_documents_by_type(response_data)
-     for doc_type, documents in doc_map.items():
-         doc_type_path = os.path.join(output_directory, doc_type)
-         if not os.path.exists(doc_type_path):
-             os.makedirs(doc_type_path)
-
-         if doc_type in ("image", "structured") and images_to_disk:
-             for i, doc in enumerate(documents):
-                 meta: Dict[str, Any] = doc.get("metadata", {})
-                 image_content = meta.get("content")
-                 if doc_type == "image":
-                     image_type = meta.get("image_metadata", {}).get("image_type", "png").lower()
-                 else:
-                     image_type = "png"
-
-                 if image_content and image_type in {"png", "bmp", "jpeg", "jpg", "tiff"}:
-                     try:
-                         # Decode the base64 content
-                         image_data = base64.b64decode(image_content)
-                         image = Image.open(io.BytesIO(image_data))
-
-                         # Define the output file path
-                         image_ext = "jpg" if image_type == "jpeg" else image_type
-                         image_filename = f"{clean_doc_name}_{i}.{image_ext}"
-                         image_output_path = os.path.join(doc_type_path, "media", image_filename)
-
-                         # Ensure the media directory exists
-                         os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
-
-                         # Save the image to disk
-                         image.save(image_output_path, format=image_ext.upper())
-
-                         # Update the metadata content with the image path
-                         meta["content"] = ""
-                         meta["content_url"] = os.path.realpath(image_output_path)
-                         logger.debug(f"Saved image to {image_output_path}")
-
-                     except Exception as e:
-                         logger.error(f"Failed to save image {i} for {clean_doc_name}: {e}")
-
-         # Write the metadata JSON file
-         with open(os.path.join(doc_type_path, output_name), "w") as f:
-             f.write(json.dumps(documents, indent=2))
-
-
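The removed save_response_data wrote one <doc_type>/<source>.metadata.json file per content type, with decoded images optionally placed under a media/ subdirectory. Against the 2025.7.24 wheel, where the helper still exists, a call would have looked roughly like this; the response dict is a minimal hypothetical stand-in for a real ingestion result:

from nv_ingest_client.cli.util.processing import save_response_data  # present in 2025.7.24, removed in 2025.11.2

# Minimal hypothetical response; a real result carries much richer metadata.
response = {
    "data": [
        {
            "metadata": {
                "source_metadata": {"source_id": "report.pdf"},
                "content_metadata": {"type": "text"},
                "content": "extracted text ...",
            }
        }
    ]
}

save_response_data(response, "./out")
# Writes ./out/text/report.pdf.metadata.json; with images_to_disk=True, image
# content would instead be decoded into ./out/image/media/ and the JSON would
# carry a content_url pointing at the saved file.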
- def generate_job_batch_for_iteration(
-     client: Any,
-     pbar: Any,
-     files: List[str],
-     tasks: Dict[str, Any],
-     processed: int,
-     batch_size: int,
-     retry_job_ids: List[str],
-     fail_on_error: bool = False,
- ) -> Tuple[List[str], Dict[str, str], int]:
-     """
-     Generates a batch of job specifications for the current iteration of file processing.
-     This function handles retrying failed jobs and creating new jobs for unprocessed files.
-     The job specifications are then submitted for processing.
-
-     Parameters
-     ----------
-     client : Any
-         The client object used to submit jobs asynchronously.
-     pbar : Any
-         The progress bar object used to update the progress as jobs are processed.
-     files : List[str]
-         The list of file paths to be processed.
-     tasks : Dict[str, Any]
-         A dictionary of tasks to be executed as part of the job specifications.
-     processed : int
-         The number of files that have been processed so far.
-     batch_size : int
-         The maximum number of jobs to process in one batch.
-     retry_job_ids : List[str]
-         A list of job IDs that need to be retried due to previous failures.
-     fail_on_error : bool, optional
-         Whether to raise an error and stop processing if job specifications are missing. Default is False.
-
-     Returns
-     -------
-     Tuple[List[str], Dict[str, str], int]
-         A tuple containing:
-         - job_ids (List[str]): The list of job IDs created or retried in this iteration.
-         - job_id_map_updates (Dict[str, str]): A dictionary mapping job IDs to their corresponding file names.
-         - processed (int): The updated number of files processed.
-
-     Raises
-     ------
-     RuntimeError
-         If `fail_on_error` is True and there are missing job specifications, a RuntimeError is raised.
-     """
-     job_indices: List[str] = []
-     job_index_map_updates: Dict[str, str] = {}
-     cur_job_count: int = 0
-
-     if retry_job_ids:
-         job_indices.extend(retry_job_ids)
-         cur_job_count = len(job_indices)
-
-     if (cur_job_count < batch_size) and (processed < len(files)):
-         new_job_count: int = min(batch_size - cur_job_count, len(files) - processed)
-         batch_files: List[str] = files[processed : processed + new_job_count]
-
-         new_job_indices: List[str] = client.create_jobs_for_batch(batch_files, tasks)
-         if len(new_job_indices) != new_job_count:
-             missing_jobs: int = new_job_count - len(new_job_indices)
-             error_msg: str = f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
-             logger.warning(error_msg)
-
-             if fail_on_error:
-                 raise RuntimeError(error_msg)
-
-             pbar.update(missing_jobs)
-
-         job_index_map_updates = {job_index: file for job_index, file in zip(new_job_indices, batch_files)}
-         processed += new_job_count
-         _ = client.submit_job_async(new_job_indices, "ingest_task_queue")
-
-         job_indices.extend(new_job_indices)
-
-     return job_indices, job_index_map_updates, processed
-
-
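The batching rule in the removed helper is easy to misread: retried job IDs are carried into the next batch first, and only the remaining slots are filled with new files. A small sketch of just that arithmetic, with hypothetical numbers and no client involved:

def batch_composition(batch_size: int, retry_count: int, total_files: int, processed: int):
    # Retries occupy batch slots first; new jobs fill whatever capacity remains.
    new_job_count = max(0, min(batch_size - retry_count, total_files - processed))
    return retry_count, new_job_count

print(batch_composition(batch_size=8, retry_count=3, total_files=20, processed=10))
# (3, 5): three retried jobs plus five newly created jobs in this iteration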
- def create_and_process_jobs(
-     files: List[str],
-     client: Any,
-     tasks: Dict[str, Any],
-     output_directory: str,
-     batch_size: int,
-     fail_on_error: bool = False,
-     save_images_separately: bool = False,
- ) -> Tuple[int, Dict[str, List[float]], int, Dict[str, str]]:
-     """
-     Process a list of files by creating and submitting jobs for each file, then fetching
-     and handling the results asynchronously.
-
-     This function creates job specifications (JobSpecs) for the provided list of files,
-     submits the jobs to the client, and processes the results asynchronously. It handles
-     job retries for timeouts, logs failures, and limits the number of JobSpecs in memory to
-     `batch_size * 2`. Progress is reported on a per-file basis, including the pages processed
-     per second.
-
-     Parameters
-     ----------
-     files : List[str]
-         A list of file paths to be processed. Each file is used to create a job which is then
-         submitted to the client.
-     client : Any
-         An instance of NvIngestClient used to submit jobs and fetch results asynchronously.
-     tasks : Dict[str, Any]
-         A dictionary of tasks to be added to each job. The keys represent task names (e.g., "split",
-         "extract", "store", "caption", etc.) and the values represent task configurations.
-     output_directory : str
-         The directory path where the processed job results will be saved. If an empty string or None
-         is provided, results will not be saved.
-     batch_size : int
-         The number of jobs to process in each batch. Memory is limited to `batch_size * 2` jobs at
-         any time.
-     fail_on_error : bool, optional
-         If True, the function will raise an error and stop processing when encountering an unrecoverable
-         error. If False, the function logs the error and continues processing other jobs. Default is False.
-     save_images_separately : bool, optional
-         If True, images will be saved separately to disk. Default is False.
-
-     Returns
-     -------
-     Tuple[int, Dict[str, List[float]], int, Dict[str, str]]
-         A tuple containing:
-         - total_files (int): The total number of files processed.
-         - trace_times (Dict[str, List[float]]): A dictionary mapping job IDs to a list of trace times
-           for diagnostic purposes.
-         - total_pages_processed (int): The total number of pages processed from the files.
-         - trace_ids (Dict[str, str]): A dictionary mapping a source file to its correlating trace_id.
-
-     Raises
-     ------
-     RuntimeError
-         If `fail_on_error` is True and an error occurs during job submission or processing.
-     """
-     total_files: int = len(files)
-     total_pages_processed: int = 0
-     trace_times: Dict[str, List[float]] = defaultdict(list)
-     trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
-     failed_jobs: List[str] = []
-     retry_job_ids: List[str] = []
-     job_id_map: Dict[str, str] = {}
-     retry_counts: Dict[str, int] = defaultdict(int)
-
-     start_time_ns: int = time.time_ns()
-     with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
-         processed: int = 0
-         while (processed < len(files)) or retry_job_ids:
-             # Process new batch of files or retry failed job IDs
-             job_ids, job_id_map_updates, processed = generate_job_batch_for_iteration(
-                 client, pbar, files, tasks, processed, batch_size, retry_job_ids, fail_on_error
-             )
-             job_id_map.update(job_id_map_updates)
-             retry_job_ids = []
-
-             futures_dict: Dict[Any, str] = client.fetch_job_result_async(job_ids, data_only=False)
-             for future in as_completed(futures_dict.keys()):
-                 retry: bool = False
-                 job_id: str = futures_dict[future]
-                 source_name: str = job_id_map[job_id]
-                 try:
-                     future_response, trace_id = handle_future_result(future)
-                     trace_ids[source_name] = trace_id
-
-                     first_page_metadata = future_response["data"][0]["metadata"]
-
-                     file_page_counts: Dict[str, int] = {
-                         first_page_metadata["source_metadata"]["source_name"]: first_page_metadata["content_metadata"][
-                             "hierarchy"
-                         ]["page_count"]
-                     }
-
-                     if output_directory:
-                         save_response_data(future_response, output_directory, images_to_disk=save_images_separately)
-
-                     total_pages_processed += file_page_counts[source_name]
-                     elapsed_time: float = (time.time_ns() - start_time_ns) / 1e9
-                     pages_per_sec: float = total_pages_processed / elapsed_time if elapsed_time > 0 else 0
-                     pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
-
-                     process_response(future_response, trace_times)
-
-                 except TimeoutError:
-                     retry_counts[source_name] += 1
-                     retry_job_ids.append(job_id)  # Add job_id back to retry list
-                     retry = True
-                 except json.JSONDecodeError as e:
-                     logger.error(f"Decoding error while processing {job_id}({source_name}): {e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 except RuntimeError as e:
-                     logger.error(f"Error while processing '{job_id}' - ({source_name}):\n{e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 except Exception as e:
-                     logger.exception(f"Unhandled error while processing {job_id}({source_name}): {e}")
-                     failed_jobs.append(f"{job_id}::{source_name}")
-                 finally:
-                     # Do not update progress bar if we're going to retry the job.
-                     if not retry:
-                         pbar.update(1)
-
-     return total_files, trace_times, total_pages_processed, trace_ids
-
-
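For callers pinned to 2025.7.24 that drove ingestion through this function directly, the call pattern looked roughly like the sketch below; the client construction and the empty task dictionary are placeholders, not a working recipe, and a running nv-ingest service is required. The newly added nv_ingest_client/client/ingest_job_handler.py appears to take over this batching role in 2025.11.2, though that is inferred from the file list above rather than verified here.

from nv_ingest_client.client import NvIngestClient  # import path assumed from the package layout above
from nv_ingest_client.cli.util.processing import create_and_process_jobs  # removed in 2025.11.2

client = NvIngestClient()  # connection arguments omitted; defaults depend on your deployment
tasks = {}  # task configurations ("extract", "split", ...) omitted; the CLI builds these from its options

total_files, trace_times, total_pages, trace_ids = create_and_process_jobs(
    files=["docs/report.pdf"],
    client=client,
    tasks=tasks,
    output_directory="./out",
    batch_size=32,
)
print(f"{total_files} files, {total_pages} pages processed")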
  def get_valid_filename(name: Any) -> str:
      """
      Return a sanitized version of the given filename.