nv-ingest-api 2025.10.14.dev20251014__py3-none-any.whl → 2025.10.16.dev20251016__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +2 -2
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +12 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +24 -3
- nv_ingest_api/util/string_processing/yaml.py +41 -4
- {nv_ingest_api-2025.10.14.dev20251014.dist-info → nv_ingest_api-2025.10.16.dev20251016.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.10.14.dev20251014.dist-info → nv_ingest_api-2025.10.16.dev20251016.dist-info}/RECORD +10 -10
- udfs/llm_summarizer_udf.py +5 -6
- {nv_ingest_api-2025.10.14.dev20251014.dist-info → nv_ingest_api-2025.10.16.dev20251016.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.10.14.dev20251014.dist-info → nv_ingest_api-2025.10.16.dev20251016.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.10.14.dev20251014.dist-info → nv_ingest_api-2025.10.16.dev20251016.dist-info}/top_level.txt +0 -0
|
@@ -758,8 +758,8 @@ def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MO
|
|
|
758
758
|
if ocr_model_name is not None:
|
|
759
759
|
return ocr_model_name
|
|
760
760
|
|
|
761
|
-
# 2. If no gRPC endpoint is provided, fall back to the default immediately.
|
|
762
|
-
if not ocr_grpc_endpoint:
|
|
761
|
+
# 2. If no gRPC endpoint is provided or the endpoint is an NVCF endpoint, fall back to the default immediately.
|
|
762
|
+
if (not ocr_grpc_endpoint) or ("grpc.nvcf.nvidia.com" in ocr_grpc_endpoint):
|
|
763
763
|
logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.")
|
|
764
764
|
return default_model_name
|
|
765
765
|
|
|
@@ -32,6 +32,17 @@ class TracingOptionsSchema(BaseModelNoExt):
|
|
|
32
32
|
total_pages: Optional[int] = None
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
# PDF Configuration Schema
|
|
36
|
+
class PdfConfigSchema(BaseModelNoExt):
|
|
37
|
+
"""PDF-specific configuration options for job submission.
|
|
38
|
+
|
|
39
|
+
Note: split_page_count accepts any positive integer but will be clamped
|
|
40
|
+
to the [1, 128] range by the server at runtime.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
split_page_count: Annotated[int, Field(ge=1)] = 32
|
|
44
|
+
|
|
45
|
+
|
|
35
46
|
# Ingest Task Schemas
|
|
36
47
|
|
|
37
48
|
|
|
@@ -270,6 +281,7 @@ class IngestJobSchema(BaseModelNoExt):
|
|
|
270
281
|
job_id: Union[str, int]
|
|
271
282
|
tasks: List[IngestTaskSchema]
|
|
272
283
|
tracing_options: Optional[TracingOptionsSchema] = None
|
|
284
|
+
pdf_config: Optional[PdfConfigSchema] = None
|
|
273
285
|
|
|
274
286
|
|
|
275
287
|
# ------------------------------------------------------------------------------
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
import os
|
|
7
6
|
import re
|
|
8
7
|
import time
|
|
9
8
|
from typing import Any, Union, Tuple, Optional, Dict, Callable
|
|
@@ -104,6 +103,17 @@ class RestClient(MessageBrokerClientBase):
|
|
|
104
103
|
Default timeout in seconds for waiting for data after connection. Default is None.
|
|
105
104
|
http_allocator : Optional[Callable[[], Any]], optional
|
|
106
105
|
A callable that returns an HTTP client instance. If None, `requests.Session()` is used.
|
|
106
|
+
**kwargs : dict
|
|
107
|
+
Additional keyword arguments. Supported keys:
|
|
108
|
+
- api_version : str, optional
|
|
109
|
+
API version to use ('v1' or 'v2'). Defaults to 'v1' if not specified.
|
|
110
|
+
Invalid versions will log a warning and fall back to 'v1'.
|
|
111
|
+
- base_url : str, optional
|
|
112
|
+
Override the generated base URL.
|
|
113
|
+
- headers : dict, optional
|
|
114
|
+
Additional headers to include in requests.
|
|
115
|
+
- auth : optional
|
|
116
|
+
Authentication configuration for requests.
|
|
107
117
|
|
|
108
118
|
Returns
|
|
109
119
|
-------
|
|
@@ -138,8 +148,19 @@ class RestClient(MessageBrokerClientBase):
|
|
|
138
148
|
)
|
|
139
149
|
self._client = requests.Session()
|
|
140
150
|
|
|
141
|
-
#
|
|
142
|
-
|
|
151
|
+
# Validate and normalize API version to prevent misconfiguration
|
|
152
|
+
# Default to v1 for backwards compatibility if not explicitly provided
|
|
153
|
+
VALID_API_VERSIONS = {"v1", "v2"}
|
|
154
|
+
raw_api_version = kwargs.get("api_version", "v1")
|
|
155
|
+
api_version = str(raw_api_version).strip().lower()
|
|
156
|
+
|
|
157
|
+
if api_version not in VALID_API_VERSIONS:
|
|
158
|
+
logger.warning(
|
|
159
|
+
f"Invalid API version '{raw_api_version}' specified. "
|
|
160
|
+
f"Valid versions are: {VALID_API_VERSIONS}. Falling back to 'v1'."
|
|
161
|
+
)
|
|
162
|
+
api_version = "v1"
|
|
163
|
+
|
|
143
164
|
self._api_version = api_version
|
|
144
165
|
self._submit_endpoint: str = f"/{api_version}/submit_job"
|
|
145
166
|
self._fetch_endpoint: str = f"/{api_version}/fetch_job"
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import re
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
# This regex finds all forms of environment variables:
|
|
5
6
|
# $VAR, ${VAR}, $VAR|default, and ${VAR|default}
|
|
@@ -20,12 +21,46 @@ def _replacer(match: re.Match) -> str:
|
|
|
20
21
|
var_name = match.group("braced") or match.group("named")
|
|
21
22
|
default_val = match.group("braced_default") or match.group("named_default")
|
|
22
23
|
|
|
23
|
-
#
|
|
24
|
-
value = os.environ.get(var_name
|
|
24
|
+
# First try the primary env var
|
|
25
|
+
value = os.environ.get(var_name)
|
|
26
|
+
if value is not None:
|
|
27
|
+
return value
|
|
25
28
|
|
|
26
|
-
|
|
29
|
+
# If primary is missing, try the default.
|
|
30
|
+
resolved_default = _resolve_default_with_single_fallback(default_val)
|
|
31
|
+
|
|
32
|
+
if resolved_default is None:
|
|
27
33
|
return ""
|
|
28
|
-
|
|
34
|
+
|
|
35
|
+
return resolved_default
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _is_var_ref(token: str) -> Optional[str]:
|
|
39
|
+
"""If token is a $VAR or ${VAR} reference, return VAR name; else None."""
|
|
40
|
+
if not token:
|
|
41
|
+
return None
|
|
42
|
+
if token.startswith("${") and token.endswith("}"):
|
|
43
|
+
inner = token[2:-1]
|
|
44
|
+
return inner if re.fullmatch(r"\w+", inner) else None
|
|
45
|
+
if token.startswith("$"):
|
|
46
|
+
inner = token[1:]
|
|
47
|
+
return inner if re.fullmatch(r"\w+", inner) else None
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _resolve_default_with_single_fallback(default_val: Optional[str]) -> Optional[str]:
|
|
52
|
+
"""
|
|
53
|
+
Support a single-level fallback where the default itself can be another env var.
|
|
54
|
+
For example, in $A|$B or ${A|$B}, we try B if A is missing.
|
|
55
|
+
"""
|
|
56
|
+
if default_val is None:
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
var = _is_var_ref(default_val)
|
|
60
|
+
if var is not None:
|
|
61
|
+
return os.environ.get(var, None)
|
|
62
|
+
|
|
63
|
+
return default_val
|
|
29
64
|
|
|
30
65
|
|
|
31
66
|
def substitute_env_vars_in_yaml_content(raw_content: str) -> str:
|
|
@@ -35,6 +70,8 @@ def substitute_env_vars_in_yaml_content(raw_content: str) -> str:
|
|
|
35
70
|
This function finds all occurrences of environment variable placeholders
|
|
36
71
|
($VAR, ${VAR}, $VAR|default, ${VAR|default}) in the input string
|
|
37
72
|
and replaces them with their corresponding environment variable values.
|
|
73
|
+
Also supports a single fallback to another env var: $VAR|$OTHER, ${VAR|$OTHER}
|
|
74
|
+
Quoted defaults are preserved EXACTLY as written (e.g., 'a,b' keeps quotes).
|
|
38
75
|
|
|
39
76
|
Args:
|
|
40
77
|
raw_content: The raw string content of a YAML file.
|
|
@@ -58,7 +58,7 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
|
|
|
58
58
|
nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
|
|
59
59
|
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=jqbEbavvr9giODpzsGQSRDu5yZ4YfNfKAQfqUm9yUDI,11698
|
|
60
60
|
nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
|
|
61
|
-
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=
|
|
61
|
+
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=WuX-veTC510TbvMWYGYP6WCzjYCbUBAUc5ovJUWCrFU,29607
|
|
62
62
|
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=3XXJkeJaVFe_iIfNn_bDYn79JN20besjZHiNZ5dEnZQ,12778
|
|
63
63
|
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
|
|
64
64
|
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=sUDKKlVqKjiHEGr2D04I7S4pDfnLR8b-NplV1pf5GVQ,6240
|
|
@@ -84,7 +84,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
|
|
|
84
84
|
nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
|
|
85
85
|
nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
86
86
|
nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
|
|
87
|
-
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=
|
|
87
|
+
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=oPBoukRAnLW8BH6iKB0A_WIdewi_Go0NlxrakBwnswo,10782
|
|
88
88
|
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=FDD6yq-QxW8yDwn0Bq6bmWakX41ABMn3cytrvCbT-Po,11961
|
|
89
89
|
nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
|
|
90
90
|
nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
@@ -158,16 +158,16 @@ nv_ingest_api/util/service_clients/kafka/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1
|
|
|
158
158
|
nv_ingest_api/util/service_clients/redis/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
159
159
|
nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=3NLecvIvVN1v-sA7d7G-_f6qJVZyfJE2H8Iu5KG3Aew,37417
|
|
160
160
|
nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
161
|
-
nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=
|
|
161
|
+
nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=7ymPxhuN9SP8nPSVepqqbvUxXPaTVunq2aC2bDbg98g,23684
|
|
162
162
|
nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
|
|
163
163
|
nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jfXRo9_M6hCZ59OxKLxG_47HRY,29888
|
|
164
|
-
nv_ingest_api/util/string_processing/yaml.py,sha256=
|
|
164
|
+
nv_ingest_api/util/string_processing/yaml.py,sha256=4Zdmc4474lUZn6kznqaNTlQJwsmRnnJQZ-DvAWLu-zo,2678
|
|
165
165
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
166
|
nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
|
|
167
|
-
nv_ingest_api-2025.10.
|
|
167
|
+
nv_ingest_api-2025.10.16.dev20251016.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
168
168
|
udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
|
|
169
|
-
udfs/llm_summarizer_udf.py,sha256=
|
|
170
|
-
nv_ingest_api-2025.10.
|
|
171
|
-
nv_ingest_api-2025.10.
|
|
172
|
-
nv_ingest_api-2025.10.
|
|
173
|
-
nv_ingest_api-2025.10.
|
|
169
|
+
udfs/llm_summarizer_udf.py,sha256=9kM3W08FVnNHIRrDDZEuQRnPJmOdU2N-YTcnaBy8NdU,7267
|
|
170
|
+
nv_ingest_api-2025.10.16.dev20251016.dist-info/METADATA,sha256=fsVHTpz4y-WxO1gDJOu87u8fSfAxOFrabEkcYf-Wvg8,14086
|
|
171
|
+
nv_ingest_api-2025.10.16.dev20251016.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
172
|
+
nv_ingest_api-2025.10.16.dev20251016.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
|
|
173
|
+
nv_ingest_api-2025.10.16.dev20251016.dist-info/RECORD,,
|
udfs/llm_summarizer_udf.py
CHANGED
|
@@ -2,8 +2,9 @@
|
|
|
2
2
|
"""
|
|
3
3
|
LLM Content Summarizer UDF for NV-Ingest Pipeline
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
Generates document summaries using NVIDIA-hosted LLMs. This production UDF demonstrates how to extract the pipeline
|
|
6
|
+
payload, run custom code (summarization), and inject results into the metadata for downstream use cases (such as
|
|
7
|
+
retrieval).
|
|
7
8
|
|
|
8
9
|
These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
|
|
9
10
|
- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
|
|
@@ -14,16 +15,14 @@ These variables can be set in the environment before running the pipeline. These
|
|
|
14
15
|
- MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
|
|
15
16
|
TODO: Implement this
|
|
16
17
|
- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
|
|
18
|
+
|
|
19
|
+
More info can be found in `examples/udfs/README.md`
|
|
17
20
|
"""
|
|
18
21
|
|
|
19
22
|
import logging
|
|
20
23
|
import os
|
|
21
24
|
import time
|
|
22
25
|
|
|
23
|
-
# REMOVE BEFORE MERGING
|
|
24
|
-
# import yaml
|
|
25
|
-
# from pathlib import Path
|
|
26
|
-
|
|
27
26
|
|
|
28
27
|
logger = logging.getLogger(__name__)
|
|
29
28
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|