nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,394 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from datetime import datetime
8
+ from typing import Any
9
+ from typing import Dict
10
+ from typing import List
11
+ from typing import Optional
12
+ from typing import Union
13
+
14
+ from pydantic import field_validator, model_validator, Field
15
+
16
+ from nv_ingest_api.internal.enums.common import (
17
+ AccessLevelEnum,
18
+ ContentTypeEnum,
19
+ TextTypeEnum,
20
+ LanguageEnum,
21
+ TableFormatEnum,
22
+ StatusEnum,
23
+ DocumentTypeEnum,
24
+ TaskTypeEnum,
25
+ )
26
+ from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
27
+ from nv_ingest_api.util.converters import datetools
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ # Sub schemas
33
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content
    and metadata are extracted.

    ``date_created`` and ``last_modified`` default to the ISO 8601 timestamp of
    the moment the instance is created. Values that go through field validation
    are additionally passed to ``datetools.validate_iso8601``.
    """

    source_name: str
    """The name of the source file."""

    source_id: str
    """The ID of the source file."""

    source_location: str = ""
    """The URL, URI, or pointer to the storage location of the source file."""

    source_type: Union[DocumentTypeEnum, str]
    """The type of the source file, such as pdf, docx, pptx, or txt."""

    collection_id: str = ""
    """The ID of the collection in which the source is contained."""

    # A plain `= datetime.now().isoformat()` default is evaluated once, when the
    # class body runs at import time, so every instance would share the process
    # start-up timestamp. A default_factory computes it per instance instead.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was created."""

    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    """The date the source was last modified."""

    summary: str = ""
    """A summary of the source."""

    partition_id: int = -1
    """The offset of this data fragment within a larger set of fragments."""

    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
    """The role-based access control for the source."""

    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """
        Check a timestamp field with ``datetools.validate_iso8601``.

        The helper's return value is ignored and the original string is returned
        unchanged, so the helper is presumably expected to raise on malformed
        input (verify against ``datetools``).
        """
        datetools.validate_iso8601(field_value)
        return field_value
76
+
77
+
78
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Schema to hold related extracted object.

    Carries three lists; each one defaults to a fresh empty list per instance
    (``default_factory=list``).
    """

    # NOTE(review): the three lists look like parallel arrays (entry i of each
    # describing the same object) -- confirm against the code that fills them.
    content: List[str] = Field(default_factory=list)
    bbox: List[tuple] = Field(default_factory=list)
    type: List[str] = Field(default_factory=list)
86
+
87
+
88
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Schema to hold types of related extracted objects.

    Groups nearby objects into three buckets -- ``text``, ``images`` and
    ``structured`` -- each an (initially empty) ``NearbyObjectsSubSchema``.
    """

    # The defaults are model instances built once, when the class body runs.
    # NOTE(review): Pydantic is expected to copy such defaults per instance --
    # confirm for the Pydantic version in use.
    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
96
+
97
+
98
class ContentHierarchySchema(BaseModelNoExt):
    """
    Schema for the extracted content hierarchy.

    All integer fields default to -1 and ``nearby_objects`` defaults to an
    empty ``NearbyObjectsSchema``.
    """

    # NOTE(review): -1 presumably means "not set / unknown" for each of these
    # positional fields -- confirm with the extractors that populate them.
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1
    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
109
+
110
+
111
class ContentMetadataSchema(BaseModelNoExt):
    """
    Data extracted from a source; generally Text or Image.

    ``type`` is the only required field. ``MetadataSchema.check_metadata_type``
    reads it to decide which type-specific metadata sections are kept.
    """

    type: ContentTypeEnum
    """The type of the content. Text, Image, Structured, Table, or Chart."""

    description: str = ""
    """A text description of the content object."""

    page_number: int = -1
    """The page number of the content in the source."""

    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    """The location or order of the content within the source."""

    subtype: Union[ContentTypeEnum, str] = ""
    """The type of the content for structured data types, such as table or chart."""

    # NOTE(review): the unit of start_time / end_time is not visible here
    # (seconds vs. milliseconds) -- confirm with the audio extractor.
    start_time: int = -1
    """The timestamp of the start of a piece of audio content."""

    end_time: int = -1
    """The timestamp of the end of a piece of audio content."""

    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
138
+
139
+
140
class TextMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted text content.

    ``text_type`` is the only required field; every other field has a default.
    """

    text_type: TextTypeEnum
    """The type of the text, such as header or body."""

    summary: str = ""
    """An abbreviated summary of the content."""

    # Accepts a single string, a list of strings, or an arbitrary dict.
    keywords: Union[str, List[str], Dict] = ""
    """Keywords, named entities, or other phrases."""

    language: LanguageEnum = LanguageEnum.EN  # default to Unknown? Maybe do some kind of heuristic check
    """The language of the content."""

    # Annotated as a bare `tuple`, so neither length nor element type is enforced.
    text_location: tuple = (0, 0, 0, 0)
    """The bounding box of the text, in the format (x1,y1,x2,y2)."""

    text_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""

    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
164
+
165
+
166
class ImageMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted image content.

    ``image_type`` is the only required field. Negative ``width`` / ``height``
    values are not rejected; they are clamped to 0 and a warning is logged.
    """

    image_type: Union[DocumentTypeEnum, str]
    """The type of the image, such as structured, natural, hybrid, and others."""

    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    """The type of the content for structured data types, such as bar chart, pie chart, and others."""

    caption: str = ""
    """A caption or subheading associated with the image."""

    text: str = ""
    """Extracted text from a structured chart."""

    image_location: tuple = (0, 0, 0, 0)
    """The bounding box of the image, in the format (x1,y1,x2,y2)."""

    image_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""

    uploaded_image_url: str = ""
    """A mirror of source_metadata.source_location."""

    width: int = 0
    """The width of the image."""

    height: int = 0
    """The height of the image."""

    custom_content: Optional[Dict[str, Any]] = None

    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Ensure ``image_type`` is a ``DocumentTypeEnum`` member or a string.

        Raises
        ------
        ValueError
            If the value is of any other type.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, field):
        """
        Clamp a negative dimension to 0 instead of failing validation.

        ``field`` is the validation-info object Pydantic passes as the second
        argument; only its ``field_name`` is used, for the log message.
        """
        if v < 0:
            # Lazy %-style arguments: the message is only formatted when the
            # WARNING level is actually enabled. Rendered text is unchanged.
            logger.warning("%s is negative; clamping to 0. Original value: %s", field.field_name, v)
            return 0
        return v
212
+
213
+
214
class TableMetadataSchema(BaseModelNoExt):
    """
    The schema for the extracted table content.

    ``table_format`` is the only required field; every other field has a default.
    """

    caption: str = ""
    """The caption for the table."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to table_metadata.table_format."""

    # NOTE(review): how this differs from `table_format` is not visible in this
    # file -- confirm the intended meaning with the table extractor.
    table_content_format: Union[TableFormatEnum, str] = ""

    # Annotated as a bare `tuple`, so neither length nor element type is enforced.
    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the table, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
243
+
244
+
245
class ChartMetadataSchema(BaseModelNoExt):
    """
    The schema for table content extracted from charts.

    Declares the same fields, types and defaults as ``TableMetadataSchema``;
    ``table_format`` is the only required field.
    """

    caption: str = ""
    """The caption for the chart."""

    table_format: TableFormatEnum
    """
    The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
    html, latex, simple (cells separated as spaces).
    """

    table_content: str = ""
    """Extracted text content, formatted according to chart_metadata.table_format."""

    # NOTE(review): how this differs from `table_format` is not visible in this
    # file -- confirm the intended meaning with the chart extractor.
    table_content_format: Union[TableFormatEnum, str] = ""

    # Annotated as a bare `tuple`, so neither length nor element type is enforced.
    table_location: tuple = (0, 0, 0, 0)
    """The bounding box of the chart, in the format (x1,y1,x2,y2)."""

    table_location_max_dimensions: tuple = (0, 0)
    """The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""

    uploaded_image_uri: str = ""
    """A mirror of source_metadata.source_location."""

    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
274
+
275
+
276
class AudioMetadataSchema(BaseModelNoExt):
    """
    The schema for extracted audio content.

    Every field is optional; both string fields default to the empty string.
    """

    audio_transcript: str = ""
    """A transcript of the audio content."""

    audio_type: str = ""
    """The type or format of the audio, such as mp3, wav."""

    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
288
+
289
+
290
+ # TODO consider deprecating this in favor of info msg...
291
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Schema describing an error attached to an extraction result.

    ``task``, ``status`` and ``error_msg`` are required; ``source_id`` defaults
    to an empty string.
    """

    task: TaskTypeEnum
    status: StatusEnum
    source_id: str = ""
    error_msg: str
    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
297
+
298
+
299
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Schema describing an informational message attached to an extraction result.

    ``task``, ``status``, ``message`` and ``filter`` are all required.
    """

    task: TaskTypeEnum
    status: StatusEnum
    message: str
    # NOTE(review): presumably marks the associated element to be filtered out
    # downstream -- confirm with the stages that read this flag.
    filter: bool
    # Free-form, caller-defined payload; no structure is imposed by this schema.
    custom_content: Optional[Dict[str, Any]] = None
305
+
306
+
307
+ # Main metadata schema
308
class MetadataSchema(BaseModelNoExt):
    """
    The primary container schema for extraction results.

    Before field validation, ``check_metadata_type`` inspects
    ``content_metadata.type`` and sets the type-specific sections that do not
    match it (audio, image, text, table) to ``None``.
    """

    content: str = ""
    """The actual textual content extracted from the source."""

    content_url: str = ""
    """A URL that points to the location of the content, if applicable."""

    embedding: Optional[List[float]] = None
    """An optional numerical vector representation (embedding) of the content."""

    source_metadata: Optional[SourceMetadataSchema] = None
    """Metadata about the original source of the content."""

    content_metadata: Optional[ContentMetadataSchema] = None
    """General metadata about the extracted content itself."""

    audio_metadata: Optional[AudioMetadataSchema] = None
    """Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""

    text_metadata: Optional[TextMetadataSchema] = None
    """Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""

    image_metadata: Optional[ImageMetadataSchema] = None
    """Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""

    table_metadata: Optional[TableMetadataSchema] = None
    """Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""

    chart_metadata: Optional[ChartMetadataSchema] = None
    """Specific metadata for chart content. Unlike table_metadata, it is not cleared by check_metadata_type."""

    error_metadata: Optional[ErrorMetadataSchema] = None
    """Metadata that describes any errors encountered during processing."""

    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    """Informational messages related to the processing."""

    debug_metadata: Optional[Dict[str, Any]] = None
    """A dictionary for storing any arbitrary debug information."""

    raise_on_failure: bool = False
    """If True, indicates that processing should halt on failure."""

    total_pages: Optional[int] = None
    """Total number of pages in the source document (V2 API)."""

    original_source_id: Optional[str] = None
    """The original source identifier before any splitting or chunking (V2 API)."""

    original_source_name: Optional[str] = None
    """The original source name before any splitting or chunking (V2 API)."""

    custom_content: Optional[Dict[str, Any]] = None

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Drop type-specific metadata that does not match ``content_metadata.type``.

        Runs before field validation, so ``values`` is the raw input. The
        content type is read defensively: ``content_metadata`` may be missing,
        explicitly ``None``, a plain dict, or an already-built model instance.
        Non-dict input is returned untouched and left to normal validation.

        Note that a dict passed in is modified in place, as before.
        """
        if not isinstance(values, dict):
            return values

        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type")
        else:
            # Covers None (-> None) and model instances (-> their .type).
            content_type = getattr(content_metadata, "type", None)

        if content_type != ContentTypeEnum.AUDIO:
            values["audio_metadata"] = None
        if content_type != ContentTypeEnum.IMAGE:
            values["image_metadata"] = None
        if content_type != ContentTypeEnum.TEXT:
            values["text_metadata"] = None
        if content_type != ContentTypeEnum.STRUCTURED:
            values["table_metadata"] = None
        # NOTE(review): chart_metadata is deliberately left as supplied to keep
        # existing behaviour; confirm whether it should follow table_metadata.
        return values
379
+
380
+
381
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a MetadataSchema from a metadata dictionary.

    The dictionary is expanded into keyword arguments, so each key is treated
    as a MetadataSchema field name.

    Parameters:
    - metadata: A dictionary representing metadata to be validated.

    Returns:
    - The MetadataSchema instance created from the dictionary.

    Raises:
    - ValidationError: If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated
@@ -0,0 +1,23 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from pydantic import BaseModel, Field, ConfigDict
6
+
7
+
8
class UDFStageSchema(BaseModel):
    """
    Schema for UDF stage configuration.

    The UDF function string should be provided in the task config. If no UDF function
    is provided and ignore_empty_udf is True, the message is returned unchanged.
    If ignore_empty_udf is False, an error is raised when no UDF function is provided.
    """

    # Defaults to False, i.e. a UDF task without a function is treated as an error.
    ignore_empty_udf: bool = Field(
        False,
        description="If True, ignore UDF tasks without udf_function and return message unchanged. "
        "If False, raise error.",
    )

    # extra="forbid": keys that are not declared fields are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,39 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Shared mixins for Pydantic schemas.
7
+ """
8
+
9
+ from typing import Any
10
+ from pydantic import BaseModel, field_validator
11
+
12
+
13
class LowercaseProtocolMixin(BaseModel):
    """
    Mixin that normalizes every field whose name ends with '_infer_protocol'.

    Before normal validation, any non-None value supplied for such a field is
    converted with ``str()``, stripped of surrounding whitespace and lowercased,
    so protocol values are handled case-insensitively (e.g. "HTTP" -> "http").
    All other fields pass through untouched.

    Examples
    --------
    >>> class MyConfigSchema(LowercaseProtocolMixin):
    ...     yolox_infer_protocol: str = ""
    ...     ocr_infer_protocol: str = ""
    >>>
    >>> config = MyConfigSchema(yolox_infer_protocol="GRPC", ocr_infer_protocol="HTTP")
    >>> config.yolox_infer_protocol
    'grpc'
    >>> config.ocr_infer_protocol
    'http'
    """

    @field_validator("*", mode="before")
    @classmethod
    def _lowercase_protocol_fields(cls, v: Any, info):
        """Return a stripped, lowercased string for '*_infer_protocol' fields; otherwise return ``v`` as-is."""
        is_protocol_field = info.field_name.endswith("_infer_protocol")
        if v is None or not is_protocol_field:
            return v
        return str(v).strip().lower()
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+ from pydantic import StrictBool
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ImageDedupSchema(BaseModel):
    """Configuration schema for the image de-duplication step."""

    # StrictBool: only real booleans are accepted (no coercion from e.g. 0/1 or "true").
    raise_on_failure: StrictBool = False
    # extra="forbid": keys that are not declared fields are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,28 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2022-2024, NVIDIA CORPORATION.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import logging
20
+
21
+ from pydantic import ConfigDict, BaseModel
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class EmbeddingStorageSchema(BaseModel):
    """Configuration schema for the embedding storage step."""

    # Plain `bool` (not StrictBool), so Pydantic's usual bool coercion applies.
    raise_on_failure: bool = False
    # extra="forbid": keys that are not declared fields are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,45 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2022-2024, NVIDIA CORPORATION.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import logging
20
+ import os
21
+ from typing import Optional, Dict, Any
22
+
23
+ from pydantic import ConfigDict, BaseModel, Field, field_validator
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ _DEFAULT_STORAGE_URI = os.environ.get("IMAGE_STORAGE_URI", "s3://nv-ingest/artifacts/store/images")
29
+
30
+
31
class ImageStorageModuleSchema(BaseModel):
    """
    Configuration schema for the image storage step.

    ``storage_uri`` falls back to the module-level ``_DEFAULT_STORAGE_URI``
    (read from the ``IMAGE_STORAGE_URI`` environment variable at import time)
    and must not be empty or whitespace-only when supplied. Unknown keys are
    rejected (``extra="forbid"``).
    """

    structured: bool = True
    images: bool = True
    storage_uri: str = Field(default_factory=lambda: _DEFAULT_STORAGE_URI)
    storage_options: Dict[str, Any] = Field(default_factory=dict)
    public_base_url: Optional[str] = None
    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")

    @field_validator("storage_uri")
    @classmethod
    def validate_storage_uri(cls, value: str) -> str:
        """Reject an empty or whitespace-only URI; return the value unmodified otherwise."""
        meaningful = value.strip() if value else ""
        if not meaningful:
            raise ValueError("`storage_uri` must be provided.")
        return value
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,36 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import ConfigDict, BaseModel, model_validator, field_validator, Field
7
+
8
+
9
class ImageCaptionExtractionSchema(BaseModel):
    """
    Configuration schema for image caption extraction.

    ``None`` supplied for ``api_key``, ``prompt`` or ``system_prompt`` is
    replaced by that field's default before validation. ``api_key`` is excluded
    from the model repr. Unknown keys are rejected (``extra="forbid"``).
    """

    api_key: str = Field(default="", repr=False)
    endpoint_url: str = "https://integrate.api.nvidia.com/v1/chat/completions"
    prompt: str = "Caption the content of this image:"
    system_prompt: str = "/no_think"
    model_name: str = "nvidia/nemotron-nano-12b-v2-vl"
    raise_on_failure: bool = False
    model_config = ConfigDict(extra="forbid")

    @field_validator("api_key", mode="before")
    @classmethod
    def _coerce_api_key_none(cls, v):
        """Map an ``api_key`` of None to the empty string."""
        if v is None:
            return ""
        return v

    @model_validator(mode="before")
    @classmethod
    def _coerce_none_to_defaults(cls, values):
        """Fill missing or None ``api_key`` / ``prompt`` / ``system_prompt`` entries of a dict input."""
        if isinstance(values, dict):
            if values.get("api_key") is None:
                values["api_key"] = ""
            # The prompts fall back to the defaults declared on the model itself.
            for prompt_field in ("prompt", "system_prompt"):
                if values.get(prompt_field) is None:
                    values[prompt_field] = cls.model_fields[prompt_field].default
        return values
@@ -0,0 +1,17 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+ from pydantic import StrictBool
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ImageFilterSchema(BaseModel):
    """Configuration schema for the image filter step."""

    # StrictBool: only real booleans are accepted (no coercion from e.g. 0/1 or "true").
    raise_on_failure: StrictBool = False
    # NOTE(review): presumably forces a CPU code path in the filter -- confirm with the consumer.
    cpu_only: StrictBool = False
    # extra="forbid": keys that are not declared fields are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,48 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel, Field, model_validator, field_validator
9
+
10
+ from typing import Optional
11
+
12
+ from nv_ingest_api.util.logging.configuration import LogLevel
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class TextEmbeddingSchema(BaseModel):
    """
    Configuration schema for text embedding.

    A missing or ``None`` ``api_key`` is normalized to the empty string before
    validation, and ``api_key`` is excluded from the model repr. Unknown keys
    are rejected (``extra="forbid"``).
    """

    api_key: str = Field(default="", repr=False)
    batch_size: int = Field(default=4)
    embedding_model: str = Field(default="nvidia/llama-3.2-nv-embedqa-1b-v2")
    embedding_nim_endpoint: str = Field(default="http://embedding:8000/v1")
    encoding_format: str = Field(default="float")
    httpx_log_level: LogLevel = Field(default=LogLevel.WARNING)
    input_type: str = Field(default="passage")
    raise_on_failure: bool = Field(default=False)
    truncate: str = Field(default="END")
    text_elements_modality: str = Field(default="text")
    image_elements_modality: str = Field(default="text")
    structured_elements_modality: str = Field(default="text")
    audio_elements_modality: str = Field(default="text")
    custom_content_field: Optional[str] = None
    result_target_field: Optional[str] = None
    dimensions: Optional[int] = None

    model_config = ConfigDict(extra="forbid")

    @field_validator("api_key", mode="before")
    @classmethod
    def _coerce_api_key_none(cls, v):
        """Map an ``api_key`` of None to the empty string."""
        if v is None:
            return ""
        return v

    @model_validator(mode="before")
    @classmethod
    def _coerce_none_to_empty(cls, values):
        """Set ``api_key`` to "" on dict input when it is absent or None, so validation passes."""
        if not isinstance(values, dict):
            return values
        if values.get("api_key") is None:
            values["api_key"] = ""
        return values