nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -52,6 +52,8 @@ class ContentDescriptionEnum(str, Enum):
52
52
  Description for image extracted from PDF document.
53
53
  PDF_INFOGRAPHIC : str
54
54
  Description for structured infographic extracted from PDF document.
55
+ PDF_PAGE_IMAGE : str
56
+ Description for a full-page image rendered from a PDF document.
55
57
  PDF_TABLE : str
56
58
  Description for structured table extracted from PDF document.
57
59
  PDF_TEXT : str
@@ -70,6 +72,7 @@ class ContentDescriptionEnum(str, Enum):
70
72
  PDF_CHART: str = "Structured chart extracted from PDF document."
71
73
  PDF_IMAGE: str = "Image extracted from PDF document."
72
74
  PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
75
+ PDF_PAGE_IMAGE: str = "Full-page image rendered from a PDF document."
73
76
  PDF_TABLE: str = "Structured table extracted from PDF document."
74
77
  PDF_TEXT: str = "Unstructured text from PDF document."
75
78
  PPTX_IMAGE: str = "Image extracted from PPTX presentation."
@@ -94,6 +97,8 @@ class ContentTypeEnum(str, Enum):
94
97
  Represents image content.
95
98
  INFO_MSG : str
96
99
  Represents an informational message.
100
+ PAGE_IMAGE : str
101
+ Represents a full-page image rendered from a document.
97
102
  STRUCTURED : str
98
103
  Represents structured content.
99
104
  TEXT : str
@@ -111,6 +116,7 @@ class ContentTypeEnum(str, Enum):
111
116
  INFOGRAPHIC: str = "infographic"
112
117
  INFO_MSG: str = "info_message"
113
118
  NONE: str = "none"
119
+ PAGE_IMAGE: str = "page_image"
114
120
  STRUCTURED: str = "structured"
115
121
  TABLE: str = "table"
116
122
  TEXT: str = "text"
@@ -4,20 +4,21 @@
4
4
  # Copyright (c) 2024, NVIDIA CORPORATION.
5
5
 
6
6
  import base64
7
+ import inspect
7
8
  import io
8
-
9
- import pandas as pd
10
- from typing import Any, Dict, List, Optional
11
9
  import logging
10
+ from typing import Any
11
+ from typing import Dict
12
+ from typing import List
13
+ from typing import Optional
12
14
 
13
- from nv_ingest_api.internal.extract.pdf.engines import (
14
- adobe_extractor,
15
- llama_parse_extractor,
16
- nemoretriever_parse_extractor,
17
- pdfium_extractor,
18
- tika_extractor,
19
- unstructured_io_extractor,
20
- )
15
+ import pandas as pd
16
+ from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
17
+ from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
18
+ from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
19
+ from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
20
+ from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
21
+ from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
21
22
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
22
23
 
23
24
  # Import extraction functions for different engines.
@@ -43,6 +44,7 @@ def _work_extract_pdf(
43
44
  extract_infographics: bool,
44
45
  extract_tables: bool,
45
46
  extract_charts: bool,
47
+ extract_page_as_image: bool,
46
48
  extractor_config: dict,
47
49
  execution_trace_log=None,
48
50
  ) -> Any:
@@ -52,17 +54,25 @@ def _work_extract_pdf(
52
54
 
53
55
  extract_method = extractor_config["extract_method"]
54
56
  extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
55
- return extractor_fn(
56
- pdf_stream,
57
- extract_text,
58
- extract_images,
59
- extract_infographics,
60
- extract_tables,
61
- extract_charts,
62
- extractor_config,
63
- execution_trace_log,
57
+
58
+ extractor_fn_args = dict(
59
+ pdf_stream=pdf_stream,
60
+ extract_text=extract_text,
61
+ extract_images=extract_images,
62
+ extract_infographics=extract_infographics,
63
+ extract_tables=extract_tables,
64
+ extract_charts=extract_charts,
65
+ extractor_config=extractor_config,
66
+ execution_trace_log=execution_trace_log,
64
67
  )
65
68
 
69
+ if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
70
+ extractor_fn_args["extract_page_as_image"] = extract_page_as_image
71
+ elif extract_page_as_image:
72
+ logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
73
+
74
+ return extractor_fn(**extractor_fn_args)
75
+
66
76
 
67
77
  @unified_exception_handler
68
78
  def _orchestrate_row_extraction(
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
97
107
  extract_tables = params.pop("extract_tables", False)
98
108
  extract_charts = params.pop("extract_charts", False)
99
109
  extract_infographics = params.pop("extract_infographics", False)
110
+ extract_page_as_image = params.pop("extract_page_as_image", False)
100
111
  extract_method = params.get("extract_method", "pdfium")
101
112
  except KeyError as e:
102
113
  raise ValueError(f"Missing required extraction flag: {e}")
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
137
148
  extract_text=extract_text,
138
149
  extract_images=extract_images,
139
150
  extract_infographics=extract_infographics,
151
+ extract_page_as_image=extract_page_as_image,
140
152
  extract_tables=extract_tables,
141
153
  extract_charts=extract_charts,
142
154
  extractor_config=extractor_config,
@@ -24,6 +24,7 @@ import numpy as np
24
24
  import pandas as pd
25
25
  import pypdfium2 as libpdfium
26
26
 
27
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
27
28
  from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
28
29
  from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
29
30
  YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
@@ -35,6 +36,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
35
36
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
36
37
  from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
37
38
  from nv_ingest_api.util.metadata.aggregators import (
39
+ construct_image_metadata_from_base64,
38
40
  construct_image_metadata_from_pdf_image,
39
41
  extract_pdf_metadata,
40
42
  construct_text_metadata,
@@ -47,6 +49,7 @@ from nv_ingest_api.util.pdf.pdfium import (
47
49
  extract_image_like_objects_from_pdfium_page,
48
50
  )
49
51
  from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
52
+ from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
50
53
  from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
51
54
 
52
55
  logger = logging.getLogger(__name__)
@@ -385,6 +388,7 @@ def pdfium_extractor(
385
388
  extract_infographics: bool,
386
389
  extract_tables: bool,
387
390
  extract_charts: bool,
391
+ extract_page_as_image: bool,
388
392
  extractor_config: dict,
389
393
  execution_trace_log: Optional[List[Any]] = None,
390
394
  ) -> pd.DataFrame:
@@ -525,6 +529,24 @@ def pdfium_extractor(
525
529
  )
526
530
  extracted_data.extend(image_data)
527
531
 
532
+ # Full page image extraction
533
+ if extract_page_as_image:
534
+ page_text = _extract_page_text(page)
535
+ image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
536
+ base64_image = numpy_to_base64(image[0])
537
+ if len(base64_image) > 2**24 - 1:
538
+ base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
539
+ image_meta = construct_image_metadata_from_base64(
540
+ base64_image,
541
+ page_idx,
542
+ page_count,
543
+ source_metadata,
544
+ base_unified_metadata,
545
+ subtype=ContentTypeEnum.PAGE_IMAGE,
546
+ text=page_text,
547
+ )
548
+ extracted_data.append(image_meta)
549
+
528
550
  # If we want tables or charts, rasterize the page and store it
529
551
  if extract_tables or extract_charts or extract_infographics:
530
552
  image, padding_offsets = pdfium_pages_to_numpy(
@@ -575,6 +597,7 @@ def pdfium_extractor(
575
597
  execution_trace_log=execution_trace_log,
576
598
  )
577
599
  futures.append(future)
600
+
578
601
  pages_for_tables.clear()
579
602
 
580
603
  # Wait for all asynchronous jobs to complete.
@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
107
107
  model_name: Optional[str] = None
108
108
  api_key: Optional[str] = None
109
109
  filter_errors: bool = False
110
+ text_elements_modality: Optional[str] = None
111
+ image_elements_modality: Optional[str] = None
112
+ structured_elements_modality: Optional[str] = None
113
+ audio_elements_modality: Optional[str] = None
110
114
 
111
115
 
112
116
  class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
195
199
  validated_task_properties = expected_schema_cls(**task_properties)
196
200
  values["type"] = task_type # ensure type is now always the enum
197
201
  values["task_properties"] = validated_task_properties
202
+
198
203
  return values
199
204
 
200
205
  @field_validator("type", mode="before")
@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
22
22
  input_type: str = Field(default="passage")
23
23
  raise_on_failure: bool = Field(default=False)
24
24
  truncate: str = Field(default="END")
25
+ text_elements_modality: str = Field(default="text")
26
+ image_elements_modality: str = Field(default="text")
27
+ structured_elements_modality: str = Field(default="text")
28
+ audio_elements_modality: str = Field(default="text")
25
29
 
26
30
  model_config = ConfigDict(extra="forbid")
@@ -4,6 +4,7 @@
4
4
 
5
5
  import logging
6
6
  from concurrent.futures import ThreadPoolExecutor
7
+ from functools import partial
7
8
  from typing import Any, Dict, Tuple, Optional, Iterable, List
8
9
 
9
10
  import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
19
20
  logger = logging.getLogger(__name__)
20
21
 
21
22
 
23
+ MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
24
+
25
+
22
26
  # ------------------------------------------------------------------------------
23
27
  # Asynchronous Embedding Requests
24
28
  # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
33
37
  input_type: str,
34
38
  truncate: str,
35
39
  filter_errors: bool,
40
+ modalities: Optional[List[str]] = None,
36
41
  ) -> list:
37
42
  """
38
43
  Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
74
79
  base_url=embedding_nim_endpoint,
75
80
  )
76
81
 
82
+ extra_body = {
83
+ "input_type": input_type,
84
+ "truncate": truncate,
85
+ }
86
+ if modalities:
87
+ extra_body["modality"] = modalities
88
+
77
89
  resp = client.embeddings.create(
78
90
  input=prompts,
79
91
  model=embedding_model,
80
92
  encoding_format=encoding_format,
81
- extra_body={"input_type": input_type, "truncate": truncate},
93
+ extra_body=extra_body,
82
94
  )
83
95
 
84
96
  response["embedding"] = resp.data
@@ -110,6 +122,7 @@ def _async_request_handler(
110
122
  input_type: str,
111
123
  truncate: str,
112
124
  filter_errors: bool,
125
+ modalities: Optional[List[str]] = None,
113
126
  ) -> List[dict]:
114
127
  """
115
128
  Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@ def _async_request_handler(
138
151
  List[dict]
139
152
  A list of response dictionaries from the embedding service.
140
153
  """
154
+ if modalities is None:
155
+ modalities = [None] * len(prompts)
156
+
141
157
  with ThreadPoolExecutor() as executor:
142
158
  futures = [
143
159
  executor.submit(
@@ -150,8 +166,9 @@ def _async_request_handler(
150
166
  input_type=input_type,
151
167
  truncate=truncate,
152
168
  filter_errors=filter_errors,
169
+ modalities=modality_batch,
153
170
  )
154
- for prompt_batch in prompts
171
+ for prompt_batch, modality_batch in zip(prompts, modalities)
155
172
  ]
156
173
  results = [future.result() for future in futures]
157
174
 
@@ -167,6 +184,7 @@ def _async_runner(
167
184
  input_type: str,
168
185
  truncate: str,
169
186
  filter_errors: bool,
187
+ modalities: Optional[List[str]] = None,
170
188
  ) -> dict:
171
189
  """
172
190
  Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@ def _async_runner(
204
222
  input_type,
205
223
  truncate,
206
224
  filter_errors,
225
+ modalities=modalities,
207
226
  )
208
227
 
209
228
  flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
263
282
  return row
264
283
 
265
284
 
266
- def _get_pandas_text_content(row):
285
+ def _format_image_input_string(image_b64: Optional[str]) -> str:
286
+ if not image_b64:
287
+ return
288
+ return f"data:image/png;base64,{image_b64}"
289
+
290
+
291
+ def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
292
+ if (not text) or (not text.strip()) or (not image_b64):
293
+ return
294
+ return f"{text.strip()} {_format_image_input_string(image_b64)}"
295
+
296
+
297
+ def _get_pandas_text_content(row, modality="text"):
267
298
  """
268
299
  Extracts text content from a DataFrame row.
269
300
 
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
280
311
  return row["content"]
281
312
 
282
313
 
283
- def _get_pandas_table_content(row):
314
+ def _get_pandas_table_content(row, modality="text"):
284
315
  """
285
316
  Extracts table/chart content from a DataFrame row.
286
317
 
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
294
325
  str
295
326
  The table/chart content from the row.
296
327
  """
297
- return row.get("table_metadata", {}).get("table_content")
328
+ if modality == "text":
329
+ content = row.get("table_metadata", {}).get("table_content")
330
+ elif modality == "image":
331
+ content = _format_image_input_string(row.get("content"))
332
+ elif modality == "text_image":
333
+ text = row.get("table_metadata", {}).get("table_content")
334
+ image = row.get("content")
335
+ content = _format_text_image_pair_input_string(text, image)
336
+
337
+ return content
298
338
 
299
339
 
300
- def _get_pandas_image_content(row):
340
+ def _get_pandas_image_content(row, modality="text"):
301
341
  """
302
342
  Extracts image caption content from a DataFrame row.
303
343
 
@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
311
351
  str
312
352
  The image caption from the row.
313
353
  """
314
- return row.get("image_metadata", {}).get("caption")
354
+ subtype = row.get("content_metadata", {}).get("subtype")
355
+ if modality == "text":
356
+ if subtype == "page_image":
357
+ content = row.get("image_metadata", {}).get("text")
358
+ else:
359
+ content = row.get("image_metadata", {}).get("caption")
360
+ elif modality == "image":
361
+ content = _format_image_input_string(row.get("content"))
362
+ elif modality == "text_image":
363
+ if subtype == "page_image":
364
+ text = row.get("image_metadata", {}).get("text")
365
+ else:
366
+ text = row.get("image_metadata", {}).get("caption")
367
+ image = row.get("content")
368
+ content = _format_text_image_pair_input_string(text, image)
315
369
 
370
+ # A workaround to save memory.
371
+ row["content"] = ""
372
+ return content
316
373
 
317
- def _get_pandas_audio_content(row):
374
+
375
+ def _get_pandas_audio_content(row, modality="text"):
318
376
  """
319
377
  A pandas UDF used to select extracted audio transcription to be used to create embeddings.
320
378
  """
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
408
466
  # ------------------------------------------------------------------------------
409
467
 
410
468
 
469
+ def does_model_support_multimodal_embeddings(model: str) -> bool:
470
+ """
471
+ Checks if a given model supports multi-modal embeddings.
472
+
473
+ Parameters
474
+ ----------
475
+ model : str
476
+ The name of the model.
477
+
478
+ Returns
479
+ -------
480
+ bool
481
+ True if the model supports multi-modal embeddings, False otherwise.
482
+ """
483
+ return model in MULTI_MODAL_MODELS
484
+
485
+
411
486
  def transform_create_text_embeddings_internal(
412
487
  df_transform_ledger: pd.DataFrame,
413
488
  task_config: Dict[str, Any],
@@ -460,6 +535,15 @@ def transform_create_text_embeddings_internal(
460
535
  ContentTypeEnum.AUDIO: _get_pandas_audio_content,
461
536
  ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
462
537
  }
538
+ task_type_to_modality = {
539
+ ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
540
+ ContentTypeEnum.STRUCTURED: (
541
+ task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
542
+ ),
543
+ ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
544
+ ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
545
+ ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
546
+ }
463
547
 
464
548
  def _content_type_getter(row):
465
549
  return row["content_metadata"]["type"]
@@ -480,7 +564,7 @@ def transform_create_text_embeddings_internal(
480
564
  # Extract content and normalize empty or non-str to None
481
565
  extracted_content = (
482
566
  df_content["metadata"]
483
- .apply(content_getter)
567
+ .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
484
568
  .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
485
569
  )
486
570
  df_content["_content"] = extracted_content
@@ -488,9 +572,15 @@ def transform_create_text_embeddings_internal(
488
572
  # Prepare batches for only valid (non-None) content
489
573
  valid_content_mask = df_content["_content"].notna()
490
574
  if valid_content_mask.any():
491
- filtered_content_batches = _generate_batches(
492
- df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
493
- )
575
+ filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
576
+ filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
577
+
578
+ if model_name in MULTI_MODAL_MODELS:
579
+ modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
580
+ modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
581
+ else:
582
+ modality_batches = None
583
+
494
584
  content_embeddings = _async_runner(
495
585
  filtered_content_batches,
496
586
  api_key,
@@ -500,6 +590,7 @@ def transform_create_text_embeddings_internal(
500
590
  transform_config.input_type,
501
591
  transform_config.truncate,
502
592
  False,
593
+ modalities=modality_batches,
503
594
  )
504
595
  # Build a simple row index -> embedding map
505
596
  embeddings_dict = dict(
@@ -20,6 +20,9 @@ cv2.setNumThreads(1)
20
20
  DEFAULT_MAX_WIDTH = 1024
21
21
  DEFAULT_MAX_HEIGHT = 1280
22
22
 
23
+ # Workaround for PIL.Image.DecompressionBombError
24
+ Image.MAX_IMAGE_PIXELS = None
25
+
23
26
  logger = logging.getLogger(__name__)
24
27
 
25
28
 
@@ -201,6 +201,8 @@ def construct_image_metadata_from_base64(
201
201
  page_count: int,
202
202
  source_metadata: Dict[str, Any],
203
203
  base_unified_metadata: Dict[str, Any],
204
+ subtype: None | ContentTypeEnum | str = "",
205
+ text: str = "",
204
206
  ) -> List[Any]:
205
207
  """
206
208
  Extracts image data from a base64-encoded image string, decodes the image to get
@@ -252,6 +254,7 @@ def construct_image_metadata_from_base64(
252
254
  "line": -1,
253
255
  "span": -1,
254
256
  },
257
+ "subtype": subtype or "",
255
258
  }
256
259
 
257
260
  # Construct image metadata
@@ -259,7 +262,7 @@ def construct_image_metadata_from_base64(
259
262
  "image_type": DocumentTypeEnum.PNG,
260
263
  "structured_image_type": ContentTypeEnum.UNKNOWN,
261
264
  "caption": "",
262
- "text": "",
265
+ "text": text,
263
266
  "image_location": bbox,
264
267
  "image_location_max_dimensions": (width, height),
265
268
  "height": height,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.7.15.dev20250715
3
+ Version: 2025.7.16.dev20250716
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -7,7 +7,7 @@ nv_ingest_api/interface/transform.py,sha256=g6YnFR7TpEU0xNtzCvv6kqnFbuCwQ6vRMjjB
7
7
  nv_ingest_api/interface/utility.py,sha256=AL4l0cJNvTjG1MAe1YNTk1jbbPED3g4HCewzx6Ffcio,7296
8
8
  nv_ingest_api/internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  nv_ingest_api/internal/enums/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
10
- nv_ingest_api/internal/enums/common.py,sha256=HSj7qqNr6KXu_FIyK_Wvel24R-r8lV7dLA173z5XFBc,12321
10
+ nv_ingest_api/internal/enums/common.py,sha256=lzDJ35VWfIwlL_Lx_q0dfHUuwEB7CXudHIQAilpjoRw,12611
11
11
  nv_ingest_api/internal/extract/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
12
12
  nv_ingest_api/internal/extract/audio/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
13
13
  nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=_jf_UC_FTqZr-xEpwG8edwBzdDjM01gGhqm9ulOsDcY,6973
@@ -32,10 +32,10 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
32
32
  nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
33
33
  nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
34
34
  nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=XNYz4S2tMFBv0KFzXNERrVs-1raxJ_iIIXpBGlJFcD0,22987
35
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=vtdBue1EEQJsHcBuX3NdPutbLfyKPIzily6JOK6yV0w,22421
35
+ nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=8hUJUdpx6FhOBgabFmGhJiAQdl12kR8YoSbUfN-geOk,23506
36
36
  nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
37
37
  nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
38
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ2CZs167juvEZ-uV6qXWQjR08hhIu8otk2MWj4,4931
38
+ nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=4bvN6LsPksLicI6jM0JqbJFiOZNHEcuc8MVVW4XfgV8,5875
39
39
  nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
40
40
  nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
41
41
  nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -82,7 +82,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
82
82
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
83
83
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
84
84
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
85
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=szDvgc2A_JetD2Jyewyl4ac4lwpy3NiLxD9dOYz42sM,8116
85
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=ceYQjRjhBSDbbZ6q-Db7Y6GHVOvWPdGAMb3TX1vMWfY,8321
86
86
  nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=VnAzkSFat_ckI19mlwQTlFrvP6EZVCwyNl9bt51b8oU,7193
87
87
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
88
88
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
@@ -92,14 +92,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
92
92
  nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
93
93
  nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=OtM1iPw26uioC3mghbOJQurKGg641uQfhASH462VqOY,578
94
94
  nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
95
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=ongmHkJA2953f9_RI7ZYzf5BUnFzVL6Al5E8WKyfgw4,885
95
+ nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=RZCISA8CUqKiY8eJuk4uWxzo4PZ-fuYdzMO7_LYFkoM,1117
96
96
  nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
97
97
  nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
98
98
  nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
99
99
  nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
100
100
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
101
101
  nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
102
- nv_ingest_api/internal/transform/embed_text.py,sha256=A8JMotTkC8KQ0pmz4AIJhaKebza6JzhQ0aEnHX2oHY8,16539
102
+ nv_ingest_api/internal/transform/embed_text.py,sha256=kvVGlNH1S91UENXWLD31uh3KzlfJYOlYitpIFMsyowU,20033
103
103
  nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
104
104
  nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
105
105
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +123,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
123
123
  nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
124
124
  nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
125
125
  nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
126
- nv_ingest_api/util/image_processing/transforms.py,sha256=CJVGQgUvHk_mzihR8ZZrvwJUBgUYcgFAKzXyRTmKdCE,23371
126
+ nv_ingest_api/util/image_processing/transforms.py,sha256=3-xeUerc2AaXJTYuR23EjwdtjRQ8F85pS5D9zxR4cLA,23452
127
127
  nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
128
128
  nv_ingest_api/util/imports/callable_signatures.py,sha256=e2bJB1pmkN4Ee-Bf-VggOSBaQ4RXofWF5eKkWXgIj2U,1855
129
129
  nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=7GByV_-8z2X0tnVoabCxVioxOP3sYMros3ZllVAW-wY,4343
@@ -135,7 +135,7 @@ nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=h9Q4q_
135
135
  nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
136
136
  nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=fh7Q0wO5H_FtrHV1VdT6V66aZNqglOh_2XdkfLt8hgg,15722
137
137
  nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
138
- nv_ingest_api/util/metadata/aggregators.py,sha256=Y5JSKuLhhk_ldpzT3eRIcVg7QM7cTNhfQZn4g5bcbq4,15884
138
+ nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
139
139
  nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
140
140
  nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
141
141
  nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
@@ -153,8 +153,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
153
153
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
154
154
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
156
- nv_ingest_api-2025.7.15.dev20250715.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
157
- nv_ingest_api-2025.7.15.dev20250715.dist-info/METADATA,sha256=OWZyeCR9DZ23SdT0RcMdodCkxR508CZZaVczdM3qXPE,13947
158
- nv_ingest_api-2025.7.15.dev20250715.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
- nv_ingest_api-2025.7.15.dev20250715.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
160
- nv_ingest_api-2025.7.15.dev20250715.dist-info/RECORD,,
156
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
157
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/METADATA,sha256=RaPAkQ4Dtkkrn6hi9Va1t2XDpDgRbe-bFqmCVL3IlEA,13947
158
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
160
+ nv_ingest_api-2025.7.16.dev20250716.dist-info/RECORD,,