nv-ingest-api 2025.9.15.dev20250915__py3-none-any.whl → 2025.9.17.dev20250917__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -37,15 +37,35 @@ class SourceMetadataSchema(BaseModelNoExt):
37
37
  """
38
38
 
39
39
  source_name: str
40
+ """The name of the source file."""
41
+
40
42
  source_id: str
43
+ """The ID of the source file."""
44
+
41
45
  source_location: str = ""
46
+ """The URL, URI, or pointer to the storage location of the source file."""
47
+
42
48
  source_type: Union[DocumentTypeEnum, str]
49
+ """The type of the source file, such as pdf, docx, pptx, or txt."""
50
+
43
51
  collection_id: str = ""
52
+ """The ID of the collection in which the source is contained."""
53
+
44
54
  date_created: str = datetime.now().isoformat()
55
+ """The date the source was created."""
56
+
45
57
  last_modified: str = datetime.now().isoformat()
58
+ """The date the source was last modified."""
59
+
46
60
  summary: str = ""
61
+ """A summary of the source."""
62
+
47
63
  partition_id: int = -1
64
+ """The offset of this data fragment within a larger set of fragments."""
65
+
48
66
  access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
67
+ """The role-based access control for the source."""
68
+
49
69
  custom_content: Optional[Dict[str, Any]] = None
50
70
 
51
71
  @field_validator("date_created", "last_modified")
@@ -94,35 +114,87 @@ class ContentMetadataSchema(BaseModelNoExt):
94
114
  """
95
115
 
96
116
  type: ContentTypeEnum
117
+ """The type of the content. Text, Image, Structured, Table, or Chart."""
118
+
97
119
  description: str = ""
120
+ """A text description of the content object."""
121
+
98
122
  page_number: int = -1
123
+ """The page number of the content in the source."""
124
+
99
125
  hierarchy: ContentHierarchySchema = ContentHierarchySchema()
126
+ """The location or order of the content within the source."""
127
+
100
128
  subtype: Union[ContentTypeEnum, str] = ""
129
+ """The type of the content for structured data types, such as table or chart."""
130
+
101
131
  start_time: int = -1
132
+ """The timestamp of the start of a piece of audio content."""
133
+
102
134
  end_time: int = -1
135
+ """The timestamp of the end of a piece of audio content."""
136
+
103
137
  custom_content: Optional[Dict[str, Any]] = None
104
138
 
105
139
 
106
140
  class TextMetadataSchema(BaseModelNoExt):
141
+ """
142
+ The schema for the extracted text content.
143
+ """
144
+
107
145
  text_type: TextTypeEnum
146
+ """The type of the text, such as header or body."""
147
+
108
148
  summary: str = ""
149
+ """An abbreviated summary of the content."""
150
+
109
151
  keywords: Union[str, List[str], Dict] = ""
152
+ """Keywords, named entities, or other phrases."""
153
+
110
154
  language: LanguageEnum = "en" # default to Unknown? Maybe do some kind of heuristic check
155
+ """The language of the content."""
156
+
111
157
  text_location: tuple = (0, 0, 0, 0)
112
- text_location_max_dimensions: tuple = (0, 0, 0, 0)
158
+ """The bounding box of the text, in the format (x1,y1,x2,y2)."""
159
+
160
+ text_location_max_dimensions: tuple = (0, 0)
161
+ """The maximum dimensions of the bounding box of the text, in the format (x_max,y_max)."""
162
+
113
163
  custom_content: Optional[Dict[str, Any]] = None
114
164
 
115
165
 
116
166
  class ImageMetadataSchema(BaseModelNoExt):
167
+ """
168
+ The schema for the extracted image content.
169
+ """
170
+
117
171
  image_type: Union[DocumentTypeEnum, str]
172
+ """The type of the image, such as structured, natural, hybrid, and others."""
173
+
118
174
  structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
175
+ """The type of the content for structured data types, such as bar chart, pie chart, and others."""
176
+
119
177
  caption: str = ""
178
+ """A caption or subheading associated with the image."""
179
+
120
180
  text: str = ""
181
+ """Extracted text from a structured chart."""
182
+
121
183
  image_location: tuple = (0, 0, 0, 0)
184
+ """The bounding box of the image, in the format (x1,y1,x2,y2)."""
185
+
122
186
  image_location_max_dimensions: tuple = (0, 0)
187
+ """The maximum dimensions of the bounding box of the image, in the format (x_max,y_max)."""
188
+
123
189
  uploaded_image_url: str = ""
190
+ """A mirror of source_metadata.source_location."""
191
+
124
192
  width: int = 0
193
+ """The width of the image."""
194
+
125
195
  height: int = 0
196
+ """The height of the image."""
197
+
126
198
  custom_content: Optional[Dict[str, Any]] = None
127
199
 
128
200
  @field_validator("image_type")
@@ -140,30 +212,78 @@ class ImageMetadataSchema(BaseModelNoExt):
140
212
 
141
213
 
142
214
  class TableMetadataSchema(BaseModelNoExt):
215
+ """
216
+ The schema for the extracted table content.
217
+ """
218
+
143
219
  caption: str = ""
220
+ """The caption for the table."""
221
+
144
222
  table_format: TableFormatEnum
223
+ """
224
+ The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
225
+ html, latex, simple (cells separated as spaces).
226
+ """
227
+
145
228
  table_content: str = ""
229
+ """Extracted text content, formatted according to table_metadata.table_format."""
230
+
146
231
  table_content_format: Union[TableFormatEnum, str] = ""
232
+
147
233
  table_location: tuple = (0, 0, 0, 0)
234
+ """The bounding box of the table, in the format (x1,y1,x2,y2)."""
235
+
148
236
  table_location_max_dimensions: tuple = (0, 0)
237
+ """The maximum dimensions of the bounding box of the table, in the format (x_max,y_max)."""
238
+
149
239
  uploaded_image_uri: str = ""
240
+ """A mirror of source_metadata.source_location."""
241
+
150
242
  custom_content: Optional[Dict[str, Any]] = None
151
243
 
152
244
 
153
245
  class ChartMetadataSchema(BaseModelNoExt):
246
+ """
247
+ The schema for extracted chart content.
248
+ """
249
+
154
250
  caption: str = ""
251
+ """The caption for the chart."""
252
+
155
253
  table_format: TableFormatEnum
254
+ """
255
+ The format of the table. One of Structured (dataframe / lists of rows and columns), or serialized as markdown,
256
+ html, latex, simple (cells separated as spaces).
257
+ """
258
+
156
259
  table_content: str = ""
260
+ """Extracted text content, formatted according to chart_metadata.table_format."""
261
+
157
262
  table_content_format: Union[TableFormatEnum, str] = ""
263
+
158
264
  table_location: tuple = (0, 0, 0, 0)
265
+ """The bounding box of the chart, in the format (x1,y1,x2,y2)."""
266
+
159
267
  table_location_max_dimensions: tuple = (0, 0)
268
+ """The maximum dimensions of the bounding box of the chart, in the format (x_max,y_max)."""
269
+
160
270
  uploaded_image_uri: str = ""
271
+ """A mirror of source_metadata.source_location."""
272
+
161
273
  custom_content: Optional[Dict[str, Any]] = None
162
274
 
163
275
 
164
276
  class AudioMetadataSchema(BaseModelNoExt):
277
+ """
278
+ The schema for extracted audio content.
279
+ """
280
+
165
281
  audio_transcript: str = ""
282
+ """A transcript of the audio content."""
283
+
166
284
  audio_type: str = ""
285
+ """The type or format of the audio, such as mp3, wav."""
286
+
167
287
  custom_content: Optional[Dict[str, Any]] = None
168
288
 
169
289
 
@@ -186,20 +306,52 @@ class InfoMessageMetadataSchema(BaseModelNoExt):
186
306
 
187
307
  # Main metadata schema
188
308
  class MetadataSchema(BaseModelNoExt):
309
+ """
310
+ The primary container schema for extraction results.
311
+ """
312
+
189
313
  content: str = ""
314
+ """The actual textual content extracted from the source."""
315
+
190
316
  content_url: str = ""
317
+ """A URL that points to the location of the content, if applicable."""
318
+
191
319
  embedding: Optional[List[float]] = None
320
+ """An optional numerical vector representation (embedding) of the content."""
321
+
192
322
  source_metadata: Optional[SourceMetadataSchema] = None
323
+ """Metadata about the original source of the content."""
324
+
193
325
  content_metadata: Optional[ContentMetadataSchema] = None
326
+ """General metadata about the extracted content itself."""
327
+
194
328
  audio_metadata: Optional[AudioMetadataSchema] = None
329
+ """Specific metadata for audio content. Automatically set to None if content_metadata.type is not AUDIO."""
330
+
195
331
  text_metadata: Optional[TextMetadataSchema] = None
332
+ """Specific metadata for text content. Automatically set to None if content_metadata.type is not TEXT."""
333
+
196
334
  image_metadata: Optional[ImageMetadataSchema] = None
335
+ """Specific metadata for image content. Automatically set to None if content_metadata.type is not IMAGE."""
336
+
197
337
  table_metadata: Optional[TableMetadataSchema] = None
338
+ """Specific metadata for tabular content. Automatically set to None if content_metadata.type is not STRUCTURED."""
339
+
198
340
  chart_metadata: Optional[ChartMetadataSchema] = None
341
+ """Specific metadata for chart content. Automatically set to None if content_metadata.type is not STRUCTURED."""
342
+
199
343
  error_metadata: Optional[ErrorMetadataSchema] = None
344
+ """Metadata that describes any errors encountered during processing."""
345
+
200
346
  info_message_metadata: Optional[InfoMessageMetadataSchema] = None
347
+ """Informational messages related to the processing."""
348
+
201
349
  debug_metadata: Optional[Dict[str, Any]] = None
350
+ """A dictionary for storing any arbitrary debug information."""
351
+
202
352
  raise_on_failure: bool = False
353
+ """If True, indicates that processing should halt on failure."""
354
+
203
355
  custom_content: Optional[Dict[str, Any]] = None
204
356
 
205
357
  @model_validator(mode="before")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.9.15.dev20250915
3
+ Version: 2025.9.17.dev20250917
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -85,7 +85,7 @@ nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275Hl
85
85
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
86
86
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
87
87
  nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=cIpoesvIs0dR6s8dGjGHL246k5kf7hDmdhA48i8Si7s,10253
88
- nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=VnAzkSFat_ckI19mlwQTlFrvP6EZVCwyNl9bt51b8oU,7193
88
+ nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=FDD6yq-QxW8yDwn0Bq6bmWakX41ABMn3cytrvCbT-Po,11961
89
89
  nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
90
90
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
91
91
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
@@ -162,10 +162,10 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
162
162
  nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
163
163
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
164
  nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
165
- nv_ingest_api-2025.9.15.dev20250915.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
165
+ nv_ingest_api-2025.9.17.dev20250917.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
166
166
  udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
167
167
  udfs/llm_summarizer_udf.py,sha256=sIMfcH4GRyciTKUtq4dmhd6fZmAp07X32irIC4k7nEI,7316
168
- nv_ingest_api-2025.9.15.dev20250915.dist-info/METADATA,sha256=yTAx16AGbKVYkgkHPCIy3wfAFdhb84Nia0zXee7N4-M,13947
169
- nv_ingest_api-2025.9.15.dev20250915.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
- nv_ingest_api-2025.9.15.dev20250915.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
171
- nv_ingest_api-2025.9.15.dev20250915.dist-info/RECORD,,
168
+ nv_ingest_api-2025.9.17.dev20250917.dist-info/METADATA,sha256=-di8AcnkMj-wrN71D92h01wDzweW2WwQ6pKtkpCsS9w,13947
169
+ nv_ingest_api-2025.9.17.dev20250917.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
+ nv_ingest_api-2025.9.17.dev20250917.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
171
+ nv_ingest_api-2025.9.17.dev20250917.dist-info/RECORD,,