nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic.
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +511 -205
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +137 -24
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +28 -4
- nv_ingest_client/primitives/jobs/job_spec.py +1 -0
- nv_ingest_client/primitives/tasks/embed.py +16 -0
- nv_ingest_client/primitives/tasks/extract.py +1 -1
- nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client/primitives/tasks/task_factory.py +9 -12
- nv_ingest_client/primitives/tasks/udf.py +24 -27
- nv_ingest_client/util/document_analysis.py +1 -1
- nv_ingest_client/util/util.py +26 -0
- nv_ingest_client/util/vdb/milvus.py +12 -9
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/RECORD +21 -20
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.9.26.dev20250926.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/client/interface.py

@@ -6,6 +6,7 @@
 import collections
 import glob
+import gzip
 import json
 import logging
 import os
@@ -53,7 +54,7 @@ from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm
 
@@ -93,17 +94,20 @@ def ensure_job_specs(func):
 
 
 class LazyLoadedList(collections.abc.Sequence):
-    def __init__(self, filepath: str, expected_len: Optional[int] = None):
+    def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
         self.filepath = filepath
         self._len: Optional[int] = expected_len  # Store pre-calculated length
         self._offsets: Optional[List[int]] = None
+        self.compression = compression
 
         if self._len == 0:
             self._offsets = []
 
+        self._open = gzip.open if self.compression == "gzip" else open
+
     def __iter__(self) -> Iterator[Any]:
         try:
-            with …
+            with self._open(self.filepath, "rt", encoding="utf-8") as f:
                 for line in f:
                     yield json.loads(line)
         except FileNotFoundError:
@@ -120,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
         self._offsets = []
         line_count = 0
         try:
-            with …
+            with self._open(self.filepath, "rb") as f:
                 while True:
                     current_pos = f.tell()
                     line = f.readline()
@@ -144,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
     def __len__(self) -> int:
         if self._len is not None:
             return self._len
+
        if self._offsets is not None:
            self._len = len(self._offsets)
            return self._len
        self._build_index()
+
        return self._len if self._len is not None else 0
 
     def __getitem__(self, idx: int) -> Any:
@@ -170,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
             raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
 
         try:
-            with …
+            with self._open(self.filepath, "rb") as f:
                 f.seek(self._offsets[idx])
                 line_bytes = f.readline()
                 return json.loads(line_bytes.decode("utf-8"))
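The hunks above wire gzip support into the lazy results reader by swapping the file opener and keeping the same byte-offset index for random access. Below is a minimal standalone sketch of that offset-index pattern, not the class itself; for gzip files, offsets refer to the decompressed stream, so seeks remain correct but are slower.

```python
# Sketch only: mirrors the opener selection and offset indexing shown above.
import gzip
import json
from typing import Any, List, Optional


def build_offsets(filepath: str, compression: Optional[str] = None) -> List[int]:
    """Record the starting offset of every JSONL record once."""
    opener = gzip.open if compression == "gzip" else open
    offsets: List[int] = []
    with opener(filepath, "rb") as f:
        while True:
            pos = f.tell()
            line = f.readline()
            if not line:
                break
            offsets.append(pos)
    return offsets


def read_record(filepath: str, offsets: List[int], idx: int, compression: Optional[str] = None) -> Any:
    """Seek directly to one record instead of loading the whole file."""
    opener = gzip.open if compression == "gzip" else open
    with opener(filepath, "rb") as f:
        f.seek(offsets[idx])
        return json.loads(f.readline().decode("utf-8"))
```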
@@ -396,15 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.
 
@@ -414,22 +414,36 @@
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods.
-            …
-            process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        …
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg
+
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
 
+        include_parent_trace_ids = bool(kwargs.pop("include_parent_trace_ids", False))
+
         self._prepare_ingest_run()
 
         # Add jobs locally first
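A usage sketch based on the docstring above, assuming `ingestor` is an already-configured Ingestor. With `return_failures=True` the failures element is a list of (source, error) pairs, per the return annotation removed in this diff.

```python
# Hedged sketch; Ingestor setup (files, tasks, client) is not part of this hunk.
results, failures = ingestor.ingest(show_progress=True, return_failures=True)

for source_name, error_message in failures:
    print(f"{source_name} failed: {error_message}")
```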
@@ -455,6 +469,8 @@ class Ingestor:
             clean_source_basename = get_valid_filename(os.path.basename(source_name))
             file_name, file_ext = os.path.splitext(clean_source_basename)
             file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
             jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
 
             num_items_saved = save_document_results_to_jsonl(
@@ -462,10 +478,13 @@ class Ingestor:
                 jsonl_filepath,
                 source_name,
                 ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
             )
 
             if num_items_saved > 0:
-                results = LazyLoadedList(…
+                results = LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
                 if results_lock:
                     with results_lock:
                         final_results_payload_list.append(results)
@@ -538,7 +557,24 @@ class Ingestor:
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)
 
-        …
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
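Per the hunk above there are two ways to surface the client telemetry summary: a per-call keyword forwarded through `**kwargs`, or the environment variable read as 0/1. The exact shape of `summarize_telemetry()` output is not shown in this diff.

```python
# Sketch; `ingestor` is an already-configured Ingestor.
import os

# 1) Per call (show_telemetry implies enable_telemetry when the latter is unset):
results = ingestor.ingest(show_telemetry=True)

# 2) Globally, without touching call sites (default is off):
os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
```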
@@ -547,9 +583,17 @@ class Ingestor:
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )
 
+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()
 
@@ -600,7 +644,30 @@ class Ingestor:
             logger.info("Purging saved results from disk after successful VDB upload.")
             self._purge_saved_results(results)
 
-        …
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
+        if return_failures:
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
+        if include_parent_trace_ids:
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results
 
     def ingest_async(self, **kwargs: Any) -> Future:
         """
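A sketch of the full multi-flag form: the tuple is assembled in the documented order (results, then failures, then traces, then parent trace IDs), each element present only if requested. `include_parent_trace_ids` travels through `**kwargs` and only has meaning on the V2 API; `ingestor` is an already-configured Ingestor.

```python
out = ingestor.ingest(
    return_failures=True,
    return_traces=True,
    include_parent_trace_ids=True,
)
results, failures, traces, parent_trace_ids = out
```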
@@ -1068,6 +1135,7 @@ class Ingestor:
         self,
         output_directory: Optional[str] = None,
         cleanup: bool = True,
+        compression: Optional[str] = "gzip",
     ) -> "Ingestor":
         """Configures the Ingestor to save results to disk instead of memory.
 
@@ -1092,6 +1160,12 @@ class Ingestor:
             when the Ingestor's context is exited (i.e., when used in a `with`
             statement).
             Defaults to True.
+        compression : str, optional
+            The compression algorithm to use for the saved result files.
+            Currently, the only supported value is `'gzip'`. To disable
+            compression, set this parameter to `None`. Defaults to `'gzip'`,
+            which significantly reduces the disk space required for results.
+            When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
 
         Returns
         -------
@@ -1107,6 +1181,7 @@ class Ingestor:
         self._output_config = {
             "output_directory": output_directory,
             "cleanup": cleanup,
+            "compression": compression,
         }
         ensure_directory_with_permissions(output_directory)
 
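A sketch of disk-backed results with the new gzip default; file naming follows the pattern shown earlier (`<name>.<ext>.results.jsonl.gz`), and `ingestor` is an already-configured Ingestor.

```python
ingestor = ingestor.save_to_disk(output_directory="./ingest_results", compression="gzip")
results = ingestor.ingest()     # list of LazyLoadedList proxies, one per document

first_doc = results[0]
for element in first_doc:       # records are decompressed lazily, line by line
    pass
```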
@@ -1175,6 +1250,44 @@ class Ingestor:
 
         return self
 
+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
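A hedged sketch of the new fluent call. It assumes `ingestor` already has job specs (files added), since the method is guarded by `@ensure_job_specs`, and splitting only takes effect against V2 API endpoints.

```python
ingestor = ingestor.pdf_split_config(pages_per_chunk=64)  # logs a warning outside [1, 128]
results = ingestor.ingest()
```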
nv_ingest_client/client/util/processing.py

@@ -1,3 +1,4 @@
+import gzip
 import io
 import json
 import logging
@@ -6,6 +7,7 @@ import re
 from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Tuple
 
 try:
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
     jsonl_output_filepath: str,
     original_source_name_for_log: str,
     ensure_parent_dir_exists: bool = True,
+    compression: Optional[str] = None,
 ) -> Tuple[int, Dict[str, str]]:
     """
     Saves a list of extraction items (for a single source document) to a JSON Lines file.
@@ -50,6 +53,13 @@ def save_document_results_to_jsonl(
         if parent_dir:
             os.makedirs(parent_dir, exist_ok=True)
 
+    if compression == "gzip":
+        open_func = gzip.open
+    elif compression is None:
+        open_func = open
+    else:
+        raise ValueError(f"Unsupported compression type: {compression}")
+
     with io.BytesIO() as buffer:
         for extraction_item in doc_response_data:
             if USING_ORJSON:
@@ -60,7 +70,7 @@ def save_document_results_to_jsonl(
 
     count_items_written = len(doc_response_data)
 
-    with …
+    with open_func(jsonl_output_filepath, "wb") as f_jsonl:
         f_jsonl.write(full_byte_content)
 
     logger.info(
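Because the writer only swaps the opener, the saved files remain ordinary JSON Lines, optionally gzip-compressed, and can be read back without the client library. The file name in this sketch is hypothetical.

```python
import gzip
import json

with gzip.open("report.pdf.results.jsonl.gz", "rt", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(f"{len(records)} extraction items")
```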
nv_ingest_client/nv_ingest_cli.py

@@ -25,10 +25,10 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
 from nv_ingest_client.cli.util.click import click_validate_batch_size
 from nv_ingest_client.cli.util.click import click_validate_file_exists
 from nv_ingest_client.cli.util.click import click_validate_task
-from nv_ingest_client.cli.util.processing import create_and_process_jobs
 from nv_ingest_client.cli.util.processing import report_statistics
 from nv_ingest_client.cli.util.system import configure_logging
 from nv_ingest_client.client import NvIngestClient
+from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
 from nv_ingest_client.util.system import ensure_directory_with_permissions
@@ -74,6 +74,12 @@ logger = logging.getLogger(__name__)
 @click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
 @click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
+@click.option(
+    "--api_version",
+    default="v1",
+    type=click.Choice(["v1", "v2"], case_sensitive=False),
+    help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
+)
 @click.option(
     "--client_type",
     default="rest",
@@ -119,6 +125,8 @@ Example:
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
     --task 'caption:{}'
+    --pdf_split_page_count 64  # Configure PDF splitting (requires --api_version v2)
+    --api_version v2  # Use V2 API for PDF splitting support
 
 \b
 Tasks and Options:
@@ -207,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall run
 )
 @click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
 @click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
+@click.option(
+    "--pdf_split_page_count",
+    default=None,
+    type=int,
+    help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
+)
 @click.option("--version", is_flag=True, help="Show version.")
 @click.pass_context
 def main(
@@ -215,6 +229,7 @@ def main(
     client_host: str,
     client_kwargs: str,
     client_port: int,
+    api_version: str,
     client_type: str,
     concurrency_n: int,
     dataset: str,
@@ -228,6 +243,7 @@ def main(
     collect_profiling_traces: bool,
     zipkin_host: str,
     zipkin_port: int,
+    pdf_split_page_count: int,
     task: [str],
     version: [bool],
 ):
@@ -268,6 +284,10 @@ def main(
         _client_kwargs_obj = json.loads(client_kwargs)
     except Exception:
         _client_kwargs_obj = {"raw": client_kwargs}
+
+    # Merge api_version into client_kwargs
+    _client_kwargs_obj["api_version"] = api_version
+
     _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
     logging.debug(
         f"Creating message client: {client_host} and port: {client_port} -> "
@@ -285,20 +305,24 @@ def main(
         message_client_allocator=client_allocator,
         message_client_hostname=client_host,
         message_client_port=client_port,
-        message_client_kwargs=…
+        message_client_kwargs=_client_kwargs_obj,
         worker_pool_size=concurrency_n,
     )
 
     start_time_ns = time.time_ns()
-    …
-        files=docs,
+    handler = IngestJobHandler(
         client=ingest_client,
+        files=docs,
         tasks=task,
         output_directory=output_directory,
         batch_size=batch_size,
         fail_on_error=fail_on_error,
         save_images_separately=save_images_separately,
+        show_progress=True,
+        show_telemetry=True,
+        pdf_split_page_count=pdf_split_page_count,
     )
+    (total_files, trace_times, pages_processed, trace_ids) = handler.run()
 
     report_statistics(start_time_ns, trace_times, pages_processed, total_files)
 
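A sketch that mirrors the new CLI wiring above. Only arguments visible in this hunk are used; the input files, tasks variable, and batch size are hypothetical placeholders, and `pdf_split_page_count` is only honored when the client targets the v2 API.

```python
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.client.ingest_job_handler import IngestJobHandler

client = NvIngestClient(message_client_hostname="localhost", message_client_port=7670)

handler = IngestJobHandler(
    client=client,
    files=["./data/report.pdf"],   # placeholder input
    tasks=tasks,                   # same structure as the CLI --task options
    output_directory="./out",
    batch_size=256,                # placeholder value
    show_progress=True,
    show_telemetry=True,
    pdf_split_page_count=64,
)
total_files, trace_times, pages_processed, trace_ids = handler.run()
```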
nv_ingest_client/primitives/jobs/job_spec.py

@@ -110,6 +110,7 @@ class JobSpec:
             "job_id": str(self._job_id),
             "tasks": [task.to_dict() for task in self._tasks],
             "tracing_options": self._extended_options.get("tracing_options", {}),
+            "pdf_config": self._extended_options.get("pdf_config", {}),
         }
 
     @property
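Illustrative fragment only: the serialized job spec now carries a `pdf_config` block taken from the job's extended options. The inner key name below is an assumption based on `pdf_split_config(pages_per_chunk=...)` and is not shown in this hunk.

```python
serialized_job = {
    "job_id": "0",
    "tasks": [],
    "tracing_options": {},
    "pdf_config": {"pages_per_chunk": 64},  # assumed key name
}
```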
nv_ingest_client/primitives/tasks/embed.py

@@ -36,6 +36,8 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -76,6 +78,8 @@ class EmbedTask(Task):
             image_elements_modality=image_elements_modality,
             structured_elements_modality=structured_elements_modality,
             audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
         )
 
         self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +90,8 @@ class EmbedTask(Task):
         self._image_elements_modality = validated_data.image_elements_modality
         self._structured_elements_modality = validated_data.structured_elements_modality
         self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
 
     def __str__(self) -> str:
         """
@@ -114,6 +120,10 @@ class EmbedTask(Task):
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
         return info
 
     def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +159,10 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality
 
+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self.result_target_field
+
         return {"type": "embed", "task_properties": task_properties}
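A hedged sketch of the two new embed options: a custom source field for the content to embed and a target field for the resulting embedding. That reading follows from the parameter names; server-side semantics are not part of this diff, and the field names below are hypothetical.

```python
from nv_ingest_client.primitives.tasks.embed import EmbedTask

task = EmbedTask(
    custom_content_field="my_summary_text",    # hypothetical source field
    result_target_field="summary_embedding",   # hypothetical destination field
)
# When serialized, both values are forwarded under
# task_properties["custom_content_field"] / ["result_target_field"].
```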
nv_ingest_client/primitives/tasks/extract.py

@@ -86,7 +86,7 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "…
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
nv_ingest_client/primitives/tasks/task_factory.py

@@ -8,18 +8,15 @@ from typing import Dict
 from typing import Type
 from typing import Union
 
-from .…
-from .…
-from .…
-from .…
-from .…
-from .…
-from .…
-from .store import StoreTask
-from .…
-from .task_base import TaskType
-from .task_base import is_valid_task_type
-from .udf import UDFTask
+from nv_ingest_client.primitives.tasks.task_base import Task, TaskType, is_valid_task_type
+from nv_ingest_client.primitives.tasks.caption import CaptionTask
+from nv_ingest_client.primitives.tasks.dedup import DedupTask
+from nv_ingest_client.primitives.tasks.embed import EmbedTask
+from nv_ingest_client.primitives.tasks.extract import ExtractTask
+from nv_ingest_client.primitives.tasks.filter import FilterTask
+from nv_ingest_client.primitives.tasks.split import SplitTask
+from nv_ingest_client.primitives.tasks.store import StoreEmbedTask, StoreTask
+from nv_ingest_client.primitives.tasks.udf import UDFTask
 
 
 class TaskUnimplemented(Task):
nv_ingest_client/primitives/tasks/udf.py

@@ -11,6 +11,7 @@ import logging
 import importlib
 import inspect
 import ast
+import re
 from typing import Dict, Optional, Union
 
 from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
     3. File path: '/path/to/file.py:my_function'
     4. Legacy import path: 'my_module.my_function' (function name only, no imports)
     """
-    …
+    # Default to treating as inline unless it clearly matches a
+    # module/file specification. This avoids misclassifying inline code that
+    # contains colons, imports, or annotations before the def line.
 
-    …
+    spec = udf_function_spec.strip()
+
+    # 1) File path with function: /path/to/file.py:function_name
+    if ".py:" in spec:
+        file_path, function_name = spec.split(":", 1)
         return _extract_function_with_context(file_path, function_name)
 
-    …
+    # 2) File path without function name is an explicit error
+    if spec.endswith(".py"):
         raise ValueError(
-            f"File path '{udf_function_spec}' is missing function name. "
-            f"Use format 'file.py:function_name' to specify which function to use."
+            f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
         )
 
-    …
+    # 3) Module path with colon: my.module:function
+    # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
+    # no whitespace/newlines.
+    module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
+    if module_colon_pattern.match(spec):
+        module_path, function_name = spec.split(":", 1)
         try:
-            # Import the module to get its file path
             module = importlib.import_module(module_path)
             module_file = inspect.getfile(module)
-
-            # Extract the function with full module context
             return _extract_function_with_context(module_file, function_name)
-
         except ImportError as e:
             raise ValueError(f"Failed to import module '{module_path}': {e}")
         except Exception as e:
             raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
 
-    …
-    func = _load_function_from_import_path(…
-    …
-    # Get the source code of the function only
+    # 4) Legacy import path: my.module.function (no colon)
+    legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
+    if legacy_import_pattern.match(spec):
+        func = _load_function_from_import_path(spec)
         try:
            source = inspect.getsource(func)
            return source
        except (OSError, TypeError) as e:
            raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
 
-    …
+    # 5) Default: treat as inline UDF source (entire string)
+    return udf_function_spec
 
 
 class UDFTask(Task):
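The resolver accepts the four spec formats listed in its docstring and now falls back to treating anything else as inline source. A sketch against the private helper shown above; the paths and module names are placeholders, so the non-inline calls would only succeed if those targets exist.

```python
from nv_ingest_client.primitives.tasks.udf import _resolve_udf_function

# Inline source is returned unchanged (the default branch):
inline_src = _resolve_udf_function(
    "def my_udf(control_message):\n    return control_message"
)

# file_src = _resolve_udf_function("/path/to/file.py:my_function")   # file.py:function
# module_src = _resolve_udf_function("my_module:my_function")        # module:function
# legacy_src = _resolve_udf_function("my_module.my_function")        # legacy dotted path
```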
nv_ingest_client/util/document_analysis.py

@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
 
 
 def analyze_document_chunks(
-    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
 ) -> Dict[str, Dict[str, Dict[str, int]]]:
     """
     Analyze ingestor results to count elements by type and page for each document.