nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
from typing import Dict
|
|
10
|
+
from typing import List
|
|
11
|
+
from typing import Optional
|
|
12
|
+
from typing import Union
|
|
13
|
+
|
|
14
|
+
from pydantic import field_validator, model_validator, Field
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.enums.common import (
|
|
17
|
+
AccessLevelEnum,
|
|
18
|
+
ContentTypeEnum,
|
|
19
|
+
TextTypeEnum,
|
|
20
|
+
LanguageEnum,
|
|
21
|
+
TableFormatEnum,
|
|
22
|
+
StatusEnum,
|
|
23
|
+
DocumentTypeEnum,
|
|
24
|
+
TaskTypeEnum,
|
|
25
|
+
)
|
|
26
|
+
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
|
|
27
|
+
from nv_ingest_api.util.converters import datetools
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Sub schemas
|
|
33
|
+
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content
    and metadata is extracted.

    ``date_created`` and ``last_modified`` default to the local time at which
    the instance is created, and any supplied value is passed through
    ``datetools.validate_iso8601``.
    """

    source_name: str
    """The name of the source file."""

    source_id: str
    """The ID of the source file."""

    source_location: str = ""
    """The URL, URI, or pointer to the storage location of the source file."""

    source_type: Union[DocumentTypeEnum, str]
    """The type of the source file, such as pdf, docx, pptx, or txt."""

    collection_id: str = ""
    """The ID of the collection in which the source is contained."""

    # A default_factory is required here: a plain ``datetime.now().isoformat()``
    # default is evaluated once, when the class body runs at import time, so
    # every instance would share that stale timestamp.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was created."""

    # Same per-instance timestamp reasoning as ``date_created``.
    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was last modified."""

    summary: str = ""
    """A summary of the source."""

    partition_id: int = -1
    """The offset of this data fragment within a larger set of fragments."""

    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
    """The role-based access control for the source."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """
        Check a timestamp field and return it unchanged.

        The value is handed to ``datetools.validate_iso8601``, which is
        expected to raise for strings that are not ISO 8601 (its
        implementation is not visible here).
        """
        datetools.validate_iso8601(field_value)
        return field_value
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Schema to hold related extracted object.

    Holds three lists, each defaulting to a fresh empty list per instance.
    """

    # NOTE(review): the three lists look like parallel arrays (one entry per
    # nearby object), but nothing here enforces equal lengths — confirm.
    content: List[str] = Field(default_factory=list)
    bbox: List[tuple] = Field(default_factory=list)
    type: List[str] = Field(default_factory=list)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Schema to hold types of related extracted objects.

    Groups nearby objects into three buckets (text, images, structured), each
    described by a NearbyObjectsSubSchema that defaults to an empty one.
    """

    # Each default is an empty NearbyObjectsSubSchema built when the class body runs.
    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class ContentHierarchySchema(BaseModelNoExt):
    """
    Schema for the extracted content hierarchy.

    Records positional indices for a piece of content plus the objects found
    near it.
    """

    # Every positional field defaults to -1.
    # NOTE(review): -1 presumably means "unknown / not set" — confirm with producers.
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1
    # Defaults to a NearbyObjectsSchema with all buckets empty.
    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ContentMetadataSchema(BaseModelNoExt):
    """
    Data extracted from a source; generally Text or Image.

    Only ``type`` is required; every other field has a default.
    """

    type: ContentTypeEnum
    """The type of the content. Text, Image, Structured, Table, or Chart."""

    description: str = ""
    """A text description of the content object."""

    page_number: int = -1
    """The page number of the content in the source."""

    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    """The location or order of the content within the source."""

    subtype: Union[ContentTypeEnum, str] = ""
    """The type of the content for structured data types, such as table or chart."""

    # NOTE(review): the time unit of start_time / end_time is not stated here — confirm.
    start_time: int = -1
    """The timestamp of the start of a piece of audio content."""

    end_time: int = -1
    """The timestamp of the end of a piece of audio content."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TextMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted text content.

    Only ``text_type`` is required; every other field has a default.
    """

    text_type: TextTypeEnum
    """The type of the text, such as header or body."""

    summary: str = ""
    """An abbreviated summary of the content."""

    # Accepts a single string, a list of strings, or a dict.
    keywords: Union[str, List[str], Dict] = ""
    """Keywords, named entities, or other phrases."""

    language: LanguageEnum = LanguageEnum.EN  # default to Unknown? Maybe do some kind of heuristic check
    """The language of the content."""

    # Annotated as a bare ``tuple``, so length and element types are not enforced.
    text_location: tuple = (0, 0, 0, 0)
    """The bounding box of the text, in the format (x1,y1,x2,y2)."""

    text_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ImageMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted image content.

    Only ``image_type`` is required. Negative ``width`` / ``height`` values are
    not rejected; they are clamped to 0 and a warning is logged.
    """

    image_type: Union[DocumentTypeEnum, str]
    """The type of the image, such as structured, natural, hybrid, and others."""

    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    """The type of the content for structured data types, such as bar chart, pie chart, and others."""

    caption: str = ""
    """A caption or subheading associated with the image."""

    text: str = ""
    """Extracted text from a structured chart."""

    image_location: tuple = (0, 0, 0, 0)
    """The bounding box of the image, in the format (x1,y1,x2,y2)."""

    image_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""

    uploaded_image_url: str = ""
    """A mirror of source_metadata.source_location."""

    width: int = 0
    """The width of the image."""

    height: int = 0
    """The height of the image."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Ensure ``image_type`` is a ``str`` or ``DocumentTypeEnum``.

        Raises
        ------
        ValueError
            If the value is of any other type.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, info):
        """
        Clamp a negative dimension to 0 instead of failing validation.

        ``info`` is the validation info object supplied by pydantic; only its
        ``field_name`` is used, to identify the offending field in the log.
        """
        if v < 0:
            # Lazy %-style arguments: the message is only built if the record is emitted.
            logger.warning("%s is negative; clamping to 0. Original value: %s", info.field_name, v)
            return 0
        return v
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class TableMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted table content.

    Only ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    """The caption for the table."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to table_metadata.table_format."""

    # NOTE(review): undocumented in the original; how this differs from
    # ``table_format`` is not visible here — confirm with producers.
    table_content_format: Union[TableFormatEnum, str] = ""

    # Annotated as a bare ``tuple``, so length and element types are not enforced.
    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the table, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class ChartMetadataSchema(BaseModelNoExt):
    """
    The schema for table content extracted from charts.

    Declares the same fields as TableMetadataSchema; only ``table_format`` is
    required.
    """

    caption: str = ""
    """The caption for the chart."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to chart_metadata.table_format."""

    # NOTE(review): undocumented in the original; how this differs from
    # ``table_format`` is not visible here — confirm with producers.
    table_content_format: Union[TableFormatEnum, str] = ""

    # Annotated as a bare ``tuple``, so length and element types are not enforced.
    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the chart, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
class AudioMetadataSchema(BaseModelNoExt):
    """
    The schema for extracted audio content.

    Every field is optional and defaults to an empty value.
    """

    audio_transcript: str = ""
    """A transcript of the audio content."""

    # Plain string; not validated against any list of known formats.
    audio_type: str = ""
    """The type or format of the audio, such as mp3, wav."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# TODO consider deprecating this in favor of info msg...
|
|
291
|
+
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Schema describing an error: the task, a status, an optional source id and
    an error message.
    """

    # Required: the task and status this error record refers to.
    task: TaskTypeEnum
    status: StatusEnum
    # Optional; empty string when no source id is supplied.
    source_id: str = ""
    # Required error text.
    error_msg: str
    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Schema describing an informational message: the task, a status, the
    message text and a filter flag.
    """

    # All four fields below are required.
    task: TaskTypeEnum
    status: StatusEnum
    message: str
    # NOTE(review): presumably marks whether the associated content should be
    # filtered out downstream — the consumer is not visible here; confirm.
    filter: bool
    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# Main metadata schema
|
|
308
|
+
class MetadataSchema(BaseModelNoExt):
    """
    The primary container schema for extraction results.

    Before field validation, type-specific metadata blocks that do not match
    ``content_metadata.type`` are reset to ``None`` (see ``check_metadata_type``).
    """

    content: str = ""
    """The actual textual content extracted from the source."""

    content_url: str = ""
    """A URL that points to the location of the content, if applicable."""

    embedding: Optional[List[float]] = None
    """An optional numerical vector representation (embedding) of the content."""

    source_metadata: Optional[SourceMetadataSchema] = None
    """Metadata about the original source of the content."""

    content_metadata: Optional[ContentMetadataSchema] = None
    """General metadata about the extracted content itself."""

    audio_metadata: Optional[AudioMetadataSchema] = None
    """Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""

    text_metadata: Optional[TextMetadataSchema] = None
    """Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""

    image_metadata: Optional[ImageMetadataSchema] = None
    """Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""

    table_metadata: Optional[TableMetadataSchema] = None
    """Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""

    chart_metadata: Optional[ChartMetadataSchema] = None
    """Specific metadata for chart content. Automatically set to None if content_metadata.type is not STRUCTURED."""

    error_metadata: Optional[ErrorMetadataSchema] = None
    """Metadata that describes any errors encountered during processing."""

    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    """Informational messages related to the processing."""

    debug_metadata: Optional[Dict[str, Any]] = None
    """A dictionary for storing any arbitrary debug information."""

    raise_on_failure: bool = False
    """If True, indicates that processing should halt on failure."""

    total_pages: Optional[int] = None
    """Total number of pages in the source document (V2 API)."""

    original_source_id: Optional[str] = None
    """The original source identifier before any splitting or chunking (V2 API)."""

    original_source_name: Optional[str] = None
    """The original source name before any splitting or chunking (V2 API)."""

    # Optional free-form mapping; no structure is enforced by this schema.
    custom_content: Optional[Dict[str, Any]] = None

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Clear type-specific metadata that does not match the content type.

        Parameters
        ----------
        values
            The raw input handed to the model, normally a dict of field values.

        Returns
        -------
        A shallow copy of ``values`` in which ``audio_metadata``,
        ``image_metadata``, ``text_metadata`` and ``table_metadata`` are set to
        ``None`` unless ``content_metadata.type`` equals the matching
        ``ContentTypeEnum`` member. Non-dict input is returned untouched so
        pydantic can report it through its normal validation error.
        """
        if not isinstance(values, dict):
            return values

        # ``content_metadata`` may be absent, explicitly None, a raw dict, or an
        # already-built ContentMetadataSchema; only the dict form supports .get().
        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type", None)
        else:
            content_type = getattr(content_metadata, "type", None)

        # Work on a copy so the caller's dict is not modified as a side effect.
        values = dict(values)

        if content_type != ContentTypeEnum.AUDIO:
            values["audio_metadata"] = None
        if content_type != ContentTypeEnum.IMAGE:
            values["image_metadata"] = None
        if content_type != ContentTypeEnum.TEXT:
            values["text_metadata"] = None
        if content_type != ContentTypeEnum.STRUCTURED:
            values["table_metadata"] = None
        # NOTE(review): ``chart_metadata`` is deliberately left as supplied here,
        # although its field description says it is cleared for non-STRUCTURED
        # content. Confirm which of the two is intended before changing either.
        return values
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a MetadataSchema from a plain metadata dictionary.

    Parameters:
    - metadata: Mapping of field names to values; it is unpacked as keyword
      arguments to the MetadataSchema constructor.

    Returns:
    - The validated MetadataSchema instance.

    Raises:
    - ValidationError: If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, ConfigDict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class UDFStageSchema(BaseModel):
    """
    Configuration schema for the UDF stage.

    The UDF function string is expected in the task config. When it is missing,
    ``ignore_empty_udf`` decides the outcome: True returns the message
    unchanged, False raises an error. Unknown configuration keys are rejected.
    """

    # Reject any field that is not declared on this schema.
    model_config = ConfigDict(extra="forbid")

    ignore_empty_udf: bool = Field(
        default=False,
        description=(
            "If True, ignore UDF tasks without udf_function and return message unchanged. "
            "If False, raise error."
        ),
    )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Shared mixins for Pydantic schemas.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
from pydantic import BaseModel, field_validator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LowercaseProtocolMixin(BaseModel):
    """
    Mixin that normalizes every field whose name ends with '_infer_protocol'.

    Values for such fields are converted to ``str``, stripped of surrounding
    whitespace and lowercased before regular validation, so that protocol names
    are handled case-insensitively (e.g., "HTTP" -> "http"). ``None`` is left
    untouched, as are all other fields.

    Examples
    --------
    >>> class MyConfigSchema(LowercaseProtocolMixin):
    ...     yolox_infer_protocol: str = ""
    ...     ocr_infer_protocol: str = ""
    >>>
    >>> config = MyConfigSchema(yolox_infer_protocol="GRPC", ocr_infer_protocol="HTTP")
    >>> config.yolox_infer_protocol
    'grpc'
    >>> config.ocr_infer_protocol
    'http'
    """

    @field_validator("*", mode="before")
    @classmethod
    def _lowercase_protocol_fields(cls, v: Any, info):
        """Return the stripped, lowercased value for '*_infer_protocol' fields; pass others through."""
        is_protocol_field = info.field_name.endswith("_infer_protocol")
        if v is None or not is_protocol_field:
            return v
        return str(v).strip().lower()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel
|
|
9
|
+
from pydantic import StrictBool
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ImageDedupSchema(BaseModel):
    """Configuration schema holding a single strict-boolean ``raise_on_failure`` flag."""

    # Reject any field that is not declared on this schema.
    model_config = ConfigDict(extra="forbid")

    # StrictBool: only real booleans are accepted, with no coercion from other types.
    raise_on_failure: StrictBool = False
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
|
|
6
|
+
#
|
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
# you may not use this file except in compliance with the License.
|
|
9
|
+
# You may obtain a copy of the License at
|
|
10
|
+
#
|
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
#
|
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
# See the License for the specific language governing permissions and
|
|
17
|
+
# limitations under the License.
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
|
|
21
|
+
from pydantic import ConfigDict, BaseModel
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EmbeddingStorageSchema(BaseModel):
    """Configuration schema holding a single ``raise_on_failure`` flag."""

    # Reject any field that is not declared on this schema.
    model_config = ConfigDict(extra="forbid")

    raise_on_failure: bool = False
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
|
|
6
|
+
#
|
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
# you may not use this file except in compliance with the License.
|
|
9
|
+
# You may obtain a copy of the License at
|
|
10
|
+
#
|
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
#
|
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
# See the License for the specific language governing permissions and
|
|
17
|
+
# limitations under the License.
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
from typing import Optional, Dict, Any
|
|
22
|
+
|
|
23
|
+
from pydantic import ConfigDict, BaseModel, Field, field_validator
|
|
24
|
+
|
|
25
|
+
# Module-level logger, named after this module's import path. Nothing in this
# module currently logs through it.
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Default storage location. IMAGE_STORAGE_URI is read from the environment
# once, when this module is imported; if it is unset the s3:// path below is
# used.
# NOTE(review): if the variable is set but empty this becomes "", and pydantic
# does not run field validators on default values unless validate_default is
# enabled, so the storage_uri check would not catch it -- confirm this is
# acceptable.
_DEFAULT_STORAGE_URI = os.environ.get("IMAGE_STORAGE_URI", "s3://nv-ingest/artifacts/store/images")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ImageStorageModuleSchema(BaseModel):
    # Pydantic options model; judging by its name it configures the image
    # storage module (the code that consumes it is not part of this module).

    # Unknown keys are rejected at validation time instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    # Both content-type switches are on by default.
    structured: bool = True
    images: bool = True

    # When omitted, falls back to the module-level _DEFAULT_STORAGE_URI.
    storage_uri: str = Field(default_factory=lambda: _DEFAULT_STORAGE_URI)
    # A fresh empty dict per instance.
    # presumably forwarded to the storage backend -- verify against caller.
    storage_options: Dict[str, Any] = Field(default_factory=dict)
    public_base_url: Optional[str] = None
    raise_on_failure: bool = False

    @field_validator("storage_uri")
    @classmethod
    def validate_storage_uri(cls, value: str) -> str:
        """Reject an empty or whitespace-only ``storage_uri``.

        Parameters
        ----------
        value : str
            The candidate URI supplied for ``storage_uri``.

        Returns
        -------
        str
            ``value`` exactly as received; it is checked but not stripped.

        Raises
        ------
        ValueError
            If ``value`` is empty or consists only of whitespace.
        """
        if value and value.strip():
            return value
        raise ValueError("`storage_uri` must be provided.")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel, model_validator, field_validator, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ImageCaptionExtractionSchema(BaseModel):
    # Pydantic options model; judging by its name it configures image caption
    # extraction (the code that consumes it is not part of this module).

    # repr=False keeps the key out of the model's repr()/str() output.
    api_key: str = Field(default="", repr=False)
    endpoint_url: str = "https://integrate.api.nvidia.com/v1/chat/completions"
    prompt: str = "Caption the content of this image:"
    system_prompt: str = "/no_think"
    model_name: str = "nvidia/nemotron-nano-12b-v2-vl"
    raise_on_failure: bool = False
    # Unknown keys are rejected at validation time instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    @field_validator("api_key", mode="before")
    @classmethod
    def _coerce_api_key_none(cls, v):
        """Map ``api_key=None`` to ``""``; any other value passes through.

        Runs before type validation, so it also covers inputs that are not a
        dict and therefore bypass the dict-only model-level normalisation.
        """
        return "" if v is None else v

    @model_validator(mode="before")
    @classmethod
    def _coerce_none_to_defaults(cls, values):
        """Normalize None inputs so validation keeps existing defaults.

        Parameters
        ----------
        values : Any
            Raw input handed to the model. Anything that is not a ``dict`` is
            returned untouched.

        Returns
        -------
        Any
            For dict input, a new dict in which a missing or ``None``
            ``api_key`` is ``""`` and a missing or ``None`` ``prompt`` /
            ``system_prompt`` is replaced by that field's declared default.
            The dict passed in by the caller is never modified.
        """
        if not isinstance(values, dict):
            return values

        # Shallow copy: the caller may reuse the dict it passed in, so the
        # normalisation must not write into it.
        normalized = dict(values)

        if normalized.get("api_key") is None:
            normalized["api_key"] = ""
        for field_name in ("prompt", "system_prompt"):
            if normalized.get(field_name) is None:
                normalized[field_name] = cls.model_fields[field_name].default
        return normalized
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel
|
|
9
|
+
from pydantic import StrictBool
|
|
10
|
+
|
|
11
|
+
# Module-level logger, named after this module's import path. Nothing in this
# module currently logs through it.
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ImageFilterSchema(BaseModel):
    # Pydantic options model; judging by its name it configures image
    # filtering (the code that consumes it is not part of this module).

    # Unknown keys are rejected at validation time instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    # StrictBool accepts only real booleans: values such as 0/1 or "true" are
    # rejected rather than coerced. Both flags are off by default.
    raise_on_failure: StrictBool = False
    cpu_only: StrictBool = False
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, BaseModel, Field, model_validator, field_validator
|
|
9
|
+
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.util.logging.configuration import LogLevel
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named after this module's import path. Nothing in this
# module currently logs through it.
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TextEmbeddingSchema(BaseModel):
    # Pydantic options model; judging by its name it configures text
    # embedding (the code that consumes it is not part of this module).

    # repr=False keeps the key out of the model's repr()/str() output.
    api_key: str = Field(default="", repr=False)
    batch_size: int = Field(default=4)
    embedding_model: str = Field(default="nvidia/llama-3.2-nv-embedqa-1b-v2")
    embedding_nim_endpoint: str = Field(default="http://embedding:8000/v1")
    encoding_format: str = Field(default="float")
    httpx_log_level: LogLevel = Field(default=LogLevel.WARNING)
    input_type: str = Field(default="passage")
    raise_on_failure: bool = Field(default=False)
    truncate: str = Field(default="END")
    # Per-content-type modality selectors; every one defaults to "text".
    text_elements_modality: str = Field(default="text")
    image_elements_modality: str = Field(default="text")
    structured_elements_modality: str = Field(default="text")
    audio_elements_modality: str = Field(default="text")
    # Optional overrides, unset by default.
    custom_content_field: Optional[str] = None
    result_target_field: Optional[str] = None
    dimensions: Optional[int] = None

    # Unknown keys are rejected at validation time instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    @field_validator("api_key", mode="before")
    @classmethod
    def _coerce_api_key_none(cls, v):
        """Map ``api_key=None`` to ``""``; any other value passes through.

        Runs before type validation, so it also covers inputs that are not a
        dict and therefore bypass the dict-only model-level normalisation.
        """
        return "" if v is None else v

    @model_validator(mode="before")
    @classmethod
    def _coerce_none_to_empty(cls, values):
        """Convert api_key=None to empty string so validation passes when key is omitted.

        Parameters
        ----------
        values : Any
            Raw input handed to the model. Anything that is not a ``dict`` is
            returned untouched.

        Returns
        -------
        Any
            For dict input whose ``api_key`` is missing or ``None``, a new
            dict with ``api_key`` set to ``""``; otherwise ``values`` itself.
            The dict passed in by the caller is never modified.
        """
        if isinstance(values, dict) and values.get("api_key") is None:
            # Build a new dict: the caller may reuse the one it passed in, so
            # the normalisation must not write into it.
            return {**values, "api_key": ""}
        return values
|