nv-ingest-api 2025.10.14.dev20251014__py3-none-any.whl → 2025.10.16.dev20251016__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -758,8 +758,8 @@ def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MO
758
758
  if ocr_model_name is not None:
759
759
  return ocr_model_name
760
760
 
761
- # 2. If no gRPC endpoint is provided, fall back to the default immediately.
762
- if not ocr_grpc_endpoint:
761
+ # 2. If no gRPC endpoint is provided or the endpoint is an NVCF endpoint, fall back to the default immediately.
762
+ if (not ocr_grpc_endpoint) or ("grpc.nvcf.nvidia.com" in ocr_grpc_endpoint):
763
763
  logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.")
764
764
  return default_model_name
765
765
 
@@ -32,6 +32,17 @@ class TracingOptionsSchema(BaseModelNoExt):
32
32
  total_pages: Optional[int] = None
33
33
 
34
34
 
35
+ # PDF Configuration Schema
36
+ class PdfConfigSchema(BaseModelNoExt):
37
+ """PDF-specific configuration options for job submission.
38
+
39
+ Note: split_page_count accepts any positive integer but will be clamped
40
+ to [1, 128] range by the server at runtime.
41
+ """
42
+
43
+ split_page_count: Annotated[int, Field(ge=1)] = 32
44
+
45
+
35
46
  # Ingest Task Schemas
36
47
 
37
48
 
@@ -270,6 +281,7 @@ class IngestJobSchema(BaseModelNoExt):
270
281
  job_id: Union[str, int]
271
282
  tasks: List[IngestTaskSchema]
272
283
  tracing_options: Optional[TracingOptionsSchema] = None
284
+ pdf_config: Optional[PdfConfigSchema] = None
273
285
 
274
286
 
275
287
  # ------------------------------------------------------------------------------
@@ -3,7 +3,6 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
- import os
7
6
  import re
8
7
  import time
9
8
  from typing import Any, Union, Tuple, Optional, Dict, Callable
@@ -104,6 +103,17 @@ class RestClient(MessageBrokerClientBase):
104
103
  Default timeout in seconds for waiting for data after connection. Default is None.
105
104
  http_allocator : Optional[Callable[[], Any]], optional
106
105
  A callable that returns an HTTP client instance. If None, `requests.Session()` is used.
106
+ **kwargs : dict
107
+ Additional keyword arguments. Supported keys:
108
+ - api_version : str, optional
109
+ API version to use ('v1' or 'v2'). Defaults to 'v1' if not specified.
110
+ Invalid versions will log a warning and fall back to 'v1'.
111
+ - base_url : str, optional
112
+ Override the generated base URL.
113
+ - headers : dict, optional
114
+ Additional headers to include in requests.
115
+ - auth : optional
116
+ Authentication configuration for requests.
107
117
 
108
118
  Returns
109
119
  -------
@@ -138,8 +148,19 @@ class RestClient(MessageBrokerClientBase):
138
148
  )
139
149
  self._client = requests.Session()
140
150
 
141
- # Allow API version override via environment variable or kwargs
142
- api_version = kwargs.get("api_version") or os.getenv("NV_INGEST_API_VERSION", "v1")
151
+ # Validate and normalize API version to prevent misconfiguration
152
+ # Default to v1 for backwards compatibility if not explicitly provided
153
+ VALID_API_VERSIONS = {"v1", "v2"}
154
+ raw_api_version = kwargs.get("api_version", "v1")
155
+ api_version = str(raw_api_version).strip().lower()
156
+
157
+ if api_version not in VALID_API_VERSIONS:
158
+ logger.warning(
159
+ f"Invalid API version '{raw_api_version}' specified. "
160
+ f"Valid versions are: {VALID_API_VERSIONS}. Falling back to 'v1'."
161
+ )
162
+ api_version = "v1"
163
+
143
164
  self._api_version = api_version
144
165
  self._submit_endpoint: str = f"/{api_version}/submit_job"
145
166
  self._fetch_endpoint: str = f"/{api_version}/fetch_job"
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  import re
3
+ from typing import Optional
3
4
 
4
5
  # This regex finds all forms of environment variables:
5
6
  # $VAR, ${VAR}, $VAR|default, and ${VAR|default}
@@ -20,12 +21,46 @@ def _replacer(match: re.Match) -> str:
20
21
  var_name = match.group("braced") or match.group("named")
21
22
  default_val = match.group("braced_default") or match.group("named_default")
22
23
 
23
- # Get value from environment, or use default.
24
- value = os.environ.get(var_name, default_val)
24
+ # First try the primary env var
25
+ value = os.environ.get(var_name)
26
+ if value is not None:
27
+ return value
25
28
 
26
- if value is None:
29
+ # If primary is missing, try the default.
30
+ resolved_default = _resolve_default_with_single_fallback(default_val)
31
+
32
+ if resolved_default is None:
27
33
  return ""
28
- return value
34
+
35
+ return resolved_default
36
+
37
+
38
+ def _is_var_ref(token: str) -> Optional[str]:
39
+ """If token is a $VAR or ${VAR} reference, return VAR name; else None."""
40
+ if not token:
41
+ return None
42
+ if token.startswith("${") and token.endswith("}"):
43
+ inner = token[2:-1]
44
+ return inner if re.fullmatch(r"\w+", inner) else None
45
+ if token.startswith("$"):
46
+ inner = token[1:]
47
+ return inner if re.fullmatch(r"\w+", inner) else None
48
+ return None
49
+
50
+
51
+ def _resolve_default_with_single_fallback(default_val: Optional[str]) -> Optional[str]:
52
+ """
53
+ Support a single-level fallback where the default itself can be another env var.
54
+ For example, in $A|$B or ${A|$B}, we try B if A missing.
55
+ """
56
+ if default_val is None:
57
+ return None
58
+
59
+ var = _is_var_ref(default_val)
60
+ if var is not None:
61
+ return os.environ.get(var, None)
62
+
63
+ return default_val
29
64
 
30
65
 
31
66
  def substitute_env_vars_in_yaml_content(raw_content: str) -> str:
@@ -35,6 +70,8 @@ def substitute_env_vars_in_yaml_content(raw_content: str) -> str:
35
70
  This function finds all occurrences of environment variable placeholders
36
71
  ($VAR, ${VAR}, $VAR|default, ${VAR|default}) in the input string
37
72
  and replaces them with their corresponding environment variable values.
73
+ Also supports a single fallback to another env var: $VAR|$OTHER, ${VAR|$OTHER}
74
+ Quoted defaults are preserved EXACTLY as written (e.g., 'a,b' keeps quotes).
38
75
 
39
76
  Args:
40
77
  raw_content: The raw string content of a YAML file.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.14.dev20251014
3
+ Version: 2025.10.16.dev20251016
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -58,7 +58,7 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
58
58
  nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
59
59
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=jqbEbavvr9giODpzsGQSRDu5yZ4YfNfKAQfqUm9yUDI,11698
60
60
  nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
61
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=jFPB0h2SAq4RrZqiHdh28bNHvrkm0RWKZEx-ei-ahlU,29521
61
+ nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=WuX-veTC510TbvMWYGYP6WCzjYCbUBAUc5ovJUWCrFU,29607
62
62
  nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=3XXJkeJaVFe_iIfNn_bDYn79JN20besjZHiNZ5dEnZQ,12778
63
63
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
64
64
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=sUDKKlVqKjiHEGr2D04I7S4pDfnLR8b-NplV1pf5GVQ,6240
@@ -84,7 +84,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
84
84
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
85
85
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
86
86
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
87
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=23k85wUArC9J1g0_4f1KRXYHikAB2hgsH8C5Gw13uVQ,10414
87
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=oPBoukRAnLW8BH6iKB0A_WIdewi_Go0NlxrakBwnswo,10782
88
88
  nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=FDD6yq-QxW8yDwn0Bq6bmWakX41ABMn3cytrvCbT-Po,11961
89
89
  nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
90
90
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -158,16 +158,16 @@ nv_ingest_api/util/service_clients/kafka/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1
158
158
  nv_ingest_api/util/service_clients/redis/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
159
159
  nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=3NLecvIvVN1v-sA7d7G-_f6qJVZyfJE2H8Iu5KG3Aew,37417
160
160
  nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
- nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=laFVWLBFPlycLZnj6DP8zTABfSgr-FyRS2O_EbpzW_Y,22708
161
+ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=7ymPxhuN9SP8nPSVepqqbvUxXPaTVunq2aC2bDbg98g,23684
162
162
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
163
163
  nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jfXRo9_M6hCZ59OxKLxG_47HRY,29888
164
- nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
164
+ nv_ingest_api/util/string_processing/yaml.py,sha256=4Zdmc4474lUZn6kznqaNTlQJwsmRnnJQZ-DvAWLu-zo,2678
165
165
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
166
  nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
167
- nv_ingest_api-2025.10.14.dev20251014.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
167
+ nv_ingest_api-2025.10.16.dev20251016.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
168
168
  udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
169
- udfs/llm_summarizer_udf.py,sha256=t_ZFoz0e03uECYcRw4IabRj0GBlwAoJkJn13NL2wbsI,7217
170
- nv_ingest_api-2025.10.14.dev20251014.dist-info/METADATA,sha256=rcnea6vVn8B008pZg9on2O8nBSuwFRIM25KZHJ8Lb2E,14086
171
- nv_ingest_api-2025.10.14.dev20251014.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
172
- nv_ingest_api-2025.10.14.dev20251014.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
173
- nv_ingest_api-2025.10.14.dev20251014.dist-info/RECORD,,
169
+ udfs/llm_summarizer_udf.py,sha256=9kM3W08FVnNHIRrDDZEuQRnPJmOdU2N-YTcnaBy8NdU,7267
170
+ nv_ingest_api-2025.10.16.dev20251016.dist-info/METADATA,sha256=fsVHTpz4y-WxO1gDJOu87u8fSfAxOFrabEkcYf-Wvg8,14086
171
+ nv_ingest_api-2025.10.16.dev20251016.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
172
+ nv_ingest_api-2025.10.16.dev20251016.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
173
+ nv_ingest_api-2025.10.16.dev20251016.dist-info/RECORD,,
@@ -2,8 +2,9 @@
2
2
  """
3
3
  LLM Content Summarizer UDF for NV-Ingest Pipeline
4
4
 
5
- This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
6
- for enhanced downstream processing and search capabilities.
5
+ Generates document summaries using NVIDIA-hosted LLMs. This production UDF demonstrates how to extract the pipeline
6
+ payload, run custom code (summarization), and inject results into the metadata for downstream use cases (such as
7
+ retrieval).
7
8
 
8
9
  These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
9
10
  - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
@@ -14,16 +15,14 @@ These variables can be set in the environment before running the pipeline. These
14
15
  - MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
15
16
  TODO: Implement this
16
17
  - NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
18
+
19
+ More info can be found in `examples/udfs/README.md`
17
20
  """
18
21
 
19
22
  import logging
20
23
  import os
21
24
  import time
22
25
 
23
- # REMOVE BEFORE MERGING
24
- # import yaml
25
- # from pathlib import Path
26
-
27
26
 
28
27
  logger = logging.getLogger(__name__)
29
28