nv-ingest-api: nv_ingest_api-2025.9.15.dev20250915-py3-none-any.whl → nv_ingest_api-2025.9.17.dev20250917-py3-none-any.whl
This diff shows the contents of publicly available package versions that were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +153 -1
- {nv_ingest_api-2025.9.15.dev20250915.dist-info → nv_ingest_api-2025.9.17.dev20250917.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.9.15.dev20250915.dist-info → nv_ingest_api-2025.9.17.dev20250917.dist-info}/RECORD +6 -6
- {nv_ingest_api-2025.9.15.dev20250915.dist-info → nv_ingest_api-2025.9.17.dev20250917.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.9.15.dev20250915.dist-info → nv_ingest_api-2025.9.17.dev20250917.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.9.15.dev20250915.dist-info → nv_ingest_api-2025.9.17.dev20250917.dist-info}/top_level.txt +0 -0
|
@@ -37,15 +37,35 @@ class SourceMetadataSchema(BaseModelNoExt):
|
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
39
|
source_name: str
|
|
40
|
+
"""The name of the source file."""
|
|
41
|
+
|
|
40
42
|
source_id: str
|
|
43
|
+
"""The ID of the source file."""
|
|
44
|
+
|
|
41
45
|
source_location: str = ""
|
|
46
|
+
"""The URL, URI, or pointer to the storage location of the source file."""
|
|
47
|
+
|
|
42
48
|
source_type: Union[DocumentTypeEnum, str]
|
|
49
|
+
"""The type of the source file, such as pdf, docx, pptx, or txt."""
|
|
50
|
+
|
|
43
51
|
collection_id: str = ""
|
|
52
|
+
"""The ID of the collection in which the source is contained."""
|
|
53
|
+
|
|
44
54
|
date_created: str = datetime.now().isoformat()
|
|
55
|
+
"""The date the source was created."""
|
|
56
|
+
|
|
45
57
|
last_modified: str = datetime.now().isoformat()
|
|
58
|
+
"""The date the source was last modified."""
|
|
59
|
+
|
|
46
60
|
summary: str = ""
|
|
61
|
+
"""A summary of the source."""
|
|
62
|
+
|
|
47
63
|
partition_id: int = -1
|
|
64
|
+
"""The offset of this data fragment within a larger set of fragments."""
|
|
65
|
+
|
|
48
66
|
access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
|
|
67
|
+
"""The role-based access control for the source."""
|
|
68
|
+
|
|
49
69
|
custom_content: Optional[Dict[str, Any]] = None
|
|
50
70
|
|
|
51
71
|
@field_validator("date_created", "last_modified")
|
|
@@ -94,35 +114,87 @@ class ContentMetadataSchema(BaseModelNoExt):
|
|
|
94
114
|
"""
|
|
95
115
|
|
|
96
116
|
type: ContentTypeEnum
|
|
117
|
+
"""The type of the content. Text, Image, Structured, Table, or Chart."""
|
|
118
|
+
|
|
97
119
|
description: str = ""
|
|
120
|
+
"""A text description of the content object."""
|
|
121
|
+
|
|
98
122
|
page_number: int = -1
|
|
123
|
+
"""The page number of the content in the source."""
|
|
124
|
+
|
|
99
125
|
hierarchy: ContentHierarchySchema = ContentHierarchySchema()
|
|
126
|
+
"""The location or order of the content within the source."""
|
|
127
|
+
|
|
100
128
|
subtype: Union[ContentTypeEnum, str] = ""
|
|
129
|
+
"""The type of the content for structured data types, such as table or chart."""
|
|
130
|
+
|
|
101
131
|
start_time: int = -1
|
|
132
|
+
"""The timestamp of the start of a piece of audio content."""
|
|
133
|
+
|
|
102
134
|
end_time: int = -1
|
|
135
|
+
"""The timestamp of the end of a piece of audio content."""
|
|
136
|
+
|
|
103
137
|
custom_content: Optional[Dict[str, Any]] = None
|
|
104
138
|
|
|
105
139
|
|
|
106
140
|
class TextMetadataSchema(BaseModelNoExt):
|
|
141
|
+
"""
|
|
142
|
+
The schema for the extracted text content.
|
|
143
|
+
"""
|
|
144
|
+
|
|
107
145
|
text_type: TextTypeEnum
|
|
146
|
+
"""The type of the text, such as header or body."""
|
|
147
|
+
|
|
108
148
|
summary: str = ""
|
|
149
|
+
"""An abbreviated summary of the content."""
|
|
150
|
+
|
|
109
151
|
keywords: Union[str, List[str], Dict] = ""
|
|
152
|
+
"""Keywords, named entities, or other phrases."""
|
|
153
|
+
|
|
110
154
|
language: LanguageEnum = "en" # default to Unknown? Maybe do some kind of heuristic check
|
|
155
|
+
"""The language of the content."""
|
|
156
|
+
|
|
111
157
|
text_location: tuple = (0, 0, 0, 0)
|
|
112
|
-
|
|
158
|
+
"""The bounding box of the text, in the format (x1,y1,x2,y2)."""
|
|
159
|
+
|
|
160
|
+
text_location_max_dimensions: tuple = (0, 0)
|
|
161
|
+
"""The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""
|
|
162
|
+
|
|
113
163
|
custom_content: Optional[Dict[str, Any]] = None
|
|
114
164
|
|
|
115
165
|
|
|
116
166
|
class ImageMetadataSchema(BaseModelNoExt):
|
|
167
|
+
"""
|
|
168
|
+
The schema for the extracted image content.
|
|
169
|
+
"""
|
|
170
|
+
|
|
117
171
|
image_type: Union[DocumentTypeEnum, str]
|
|
172
|
+
"""The type of the image, such as structured, natural, hybrid, and others."""
|
|
173
|
+
|
|
118
174
|
structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
|
|
175
|
+
"""The type of the content for structured data types, such as bar chart, pie chart, and others."""
|
|
176
|
+
|
|
119
177
|
caption: str = ""
|
|
178
|
+
"""A caption or subheading associated with the image."""
|
|
179
|
+
|
|
120
180
|
text: str = ""
|
|
181
|
+
"""Extracted text from a structured chart."""
|
|
182
|
+
|
|
121
183
|
image_location: tuple = (0, 0, 0, 0)
|
|
184
|
+
"""The bounding box of the image, in the format (x1,y1,x2,y2)."""
|
|
185
|
+
|
|
122
186
|
image_location_max_dimensions: tuple = (0, 0)
|
|
187
|
+
"""The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""
|
|
188
|
+
|
|
123
189
|
uploaded_image_url: str = ""
|
|
190
|
+
"""A mirror of source_metadata.source_location."""
|
|
191
|
+
|
|
124
192
|
width: int = 0
|
|
193
|
+
"""The width of the image."""
|
|
194
|
+
|
|
125
195
|
height: int = 0
|
|
196
|
+
"""The height of the image."""
|
|
197
|
+
|
|
126
198
|
custom_content: Optional[Dict[str, Any]] = None
|
|
127
199
|
|
|
128
200
|
@field_validator("image_type")
|
|
@@ -140,30 +212,78 @@ class ImageMetadataSchema(BaseModelNoExt):
|
|
|
140
212
|
|
|
141
213
|
|
|
142
214
|
class TableMetadataSchema(BaseModelNoExt):
|
|
215
|
+
"""
|
|
216
|
+
The schema for the extracted table content.
|
|
217
|
+
"""
|
|
218
|
+
|
|
143
219
|
caption: str = ""
|
|
220
|
+
"""The caption for the table."""
|
|
221
|
+
|
|
144
222
|
table_format: TableFormatEnum
|
|
223
|
+
"""
|
|
224
|
+
The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
|
|
225
|
+
html, latex, simple (cells separated as spaces).
|
|
226
|
+
"""
|
|
227
|
+
|
|
145
228
|
table_content: str = ""
|
|
229
|
+
"""Extracted text content, formatted according to table_metadata.table_format."""
|
|
230
|
+
|
|
146
231
|
table_content_format: Union[TableFormatEnum, str] = ""
|
|
232
|
+
|
|
147
233
|
table_location: tuple = (0, 0, 0, 0)
|
|
234
|
+
"""The bounding box of the table, in the format (x1,y1,x2,y2)."""
|
|
235
|
+
|
|
148
236
|
table_location_max_dimensions: tuple = (0, 0)
|
|
237
|
+
"""The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""
|
|
238
|
+
|
|
149
239
|
uploaded_image_uri: str = ""
|
|
240
|
+
"""A mirror of source_metadata.source_location."""
|
|
241
|
+
|
|
150
242
|
custom_content: Optional[Dict[str, Any]] = None
|
|
151
243
|
|
|
152
244
|
|
|
153
245
|
class ChartMetadataSchema(BaseModelNoExt):
|
|
246
|
+
"""
|
|
247
|
+
The schema for extracted chart content.
|
|
248
|
+
"""
|
|
249
|
+
|
|
154
250
|
caption: str = ""
|
|
251
|
+
"""The caption for the chart."""
|
|
252
|
+
|
|
155
253
|
table_format: TableFormatEnum
|
|
254
|
+
"""
|
|
255
|
+
The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
|
|
256
|
+
html, latex, simple (cells separated as spaces).
|
|
257
|
+
"""
|
|
258
|
+
|
|
156
259
|
table_content: str = ""
|
|
260
|
+
"""Extracted text content, formatted according to chart_metadata.table_format."""
|
|
261
|
+
|
|
157
262
|
table_content_format: Union[TableFormatEnum, str] = ""
|
|
263
|
+
|
|
158
264
|
table_location: tuple = (0, 0, 0, 0)
|
|
265
|
+
"""The bounding box of the chart, in the format (x1,y1,x2,y2)."""
|
|
266
|
+
|
|
159
267
|
table_location_max_dimensions: tuple = (0, 0)
|
|
268
|
+
"""The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""
|
|
269
|
+
|
|
160
270
|
uploaded_image_uri: str = ""
|
|
271
|
+
"""A mirror of source_metadata.source_location."""
|
|
272
|
+
|
|
161
273
|
custom_content: Optional[Dict[str, Any]] = None
|
|
162
274
|
|
|
163
275
|
|
|
164
276
|
class AudioMetadataSchema(BaseModelNoExt):
|
|
277
|
+
"""
|
|
278
|
+
The schema for extracted audio content.
|
|
279
|
+
"""
|
|
280
|
+
|
|
165
281
|
audio_transcript: str = ""
|
|
282
|
+
"""A transcript of the audio content."""
|
|
283
|
+
|
|
166
284
|
audio_type: str = ""
|
|
285
|
+
"""The type or format of the audio, such as mp3, wav."""
|
|
286
|
+
|
|
167
287
|
custom_content: Optional[Dict[str, Any]] = None
|
|
168
288
|
|
|
169
289
|
|
|
@@ -186,20 +306,52 @@ class InfoMessageMetadataSchema(BaseModelNoExt):
|
|
|
186
306
|
|
|
187
307
|
# Main metadata schema
|
|
188
308
|
class MetadataSchema(BaseModelNoExt):
|
|
309
|
+
"""
|
|
310
|
+
The primary container schema for extraction results.
|
|
311
|
+
"""
|
|
312
|
+
|
|
189
313
|
content: str = ""
|
|
314
|
+
"""The actual textual content extracted from the source."""
|
|
315
|
+
|
|
190
316
|
content_url: str = ""
|
|
317
|
+
"""A URL that points to the location of the content, if applicable."""
|
|
318
|
+
|
|
191
319
|
embedding: Optional[List[float]] = None
|
|
320
|
+
"""An optional numerical vector representation (embedding) of the content."""
|
|
321
|
+
|
|
192
322
|
source_metadata: Optional[SourceMetadataSchema] = None
|
|
323
|
+
"""Metadata about the original source of the content."""
|
|
324
|
+
|
|
193
325
|
content_metadata: Optional[ContentMetadataSchema] = None
|
|
326
|
+
"""General metadata about the extracted content itself."""
|
|
327
|
+
|
|
194
328
|
audio_metadata: Optional[AudioMetadataSchema] = None
|
|
329
|
+
"""Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""
|
|
330
|
+
|
|
195
331
|
text_metadata: Optional[TextMetadataSchema] = None
|
|
332
|
+
"""Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""
|
|
333
|
+
|
|
196
334
|
image_metadata: Optional[ImageMetadataSchema] = None
|
|
335
|
+
"""Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""
|
|
336
|
+
|
|
197
337
|
table_metadata: Optional[TableMetadataSchema] = None
|
|
338
|
+
"""Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""
|
|
339
|
+
|
|
198
340
|
chart_metadata: Optional[ChartMetadataSchema] = None
|
|
341
|
+
"""Specific metadata for chart content. Automatically set to None if content_metadata.type is not STRUCTURED."""
|
|
342
|
+
|
|
199
343
|
error_metadata: Optional[ErrorMetadataSchema] = None
|
|
344
|
+
"""Metadata that describes any errors encountered during processing."""
|
|
345
|
+
|
|
200
346
|
info_message_metadata: Optional[InfoMessageMetadataSchema] = None
|
|
347
|
+
"""Informational messages related to the processing."""
|
|
348
|
+
|
|
201
349
|
debug_metadata: Optional[Dict[str, Any]] = None
|
|
350
|
+
"""A dictionary for storing any arbitrary debug information."""
|
|
351
|
+
|
|
202
352
|
raise_on_failure: bool = False
|
|
353
|
+
"""If True, indicates that processing should halt on failure."""
|
|
354
|
+
|
|
203
355
|
custom_content: Optional[Dict[str, Any]] = None
|
|
204
356
|
|
|
205
357
|
@model_validator(mode="before")
|
|
@@ -85,7 +85,7 @@ nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275Hl
|
|
|
85
85
|
nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
86
86
|
nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
|
|
87
87
|
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=cIpoesvIs0dR6s8dGjGHL246k5kf7hDmdhA48i8Si7s,10253
|
|
88
|
-
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=
|
|
88
|
+
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=FDD6yq-QxW8yDwn0Bq6bmWakX41ABMn3cytrvCbT-Po,11961
|
|
89
89
|
nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
|
|
90
90
|
nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
91
91
|
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
|
|
@@ -162,10 +162,10 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
|
|
|
162
162
|
nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
|
|
163
163
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
164
164
|
nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
|
|
165
|
-
nv_ingest_api-2025.9.
|
|
165
|
+
nv_ingest_api-2025.9.17.dev20250917.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
166
166
|
udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
|
|
167
167
|
udfs/llm_summarizer_udf.py,sha256=sIMfcH4GRyciTKUtq4dmhd6fZmAp07X32irIC4k7nEI,7316
|
|
168
|
-
nv_ingest_api-2025.9.
|
|
169
|
-
nv_ingest_api-2025.9.
|
|
170
|
-
nv_ingest_api-2025.9.
|
|
171
|
-
nv_ingest_api-2025.9.
|
|
168
|
+
nv_ingest_api-2025.9.17.dev20250917.dist-info/METADATA,sha256=-di8AcnkMj-wrN71D92h01wDzweW2WwQ6pKtkpCsS9w,13947
|
|
169
|
+
nv_ingest_api-2025.9.17.dev20250917.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
170
|
+
nv_ingest_api-2025.9.17.dev20250917.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
|
|
171
|
+
nv_ingest_api-2025.9.17.dev20250917.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|