nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.16.dev20250716__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/enums/common.py +6 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +23 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +103 -12
- nv_ingest_api/util/image_processing/transforms.py +3 -0
- nv_ingest_api/util/metadata/aggregators.py +4 -1
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/RECORD +13 -13
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.16.dev20250716.dist-info}/top_level.txt +0 -0
|
@@ -52,6 +52,8 @@ class ContentDescriptionEnum(str, Enum):
|
|
|
52
52
|
Description for image extracted from PDF document.
|
|
53
53
|
PDF_INFOGRAPHIC : str
|
|
54
54
|
Description for structured infographic extracted from PDF document.
|
|
55
|
+
PDF_PAGE_IMAGE : str
|
|
56
|
+
Description for a full-page image rendered from a PDF document.
|
|
55
57
|
PDF_TABLE : str
|
|
56
58
|
Description for structured table extracted from PDF document.
|
|
57
59
|
PDF_TEXT : str
|
|
@@ -70,6 +72,7 @@ class ContentDescriptionEnum(str, Enum):
|
|
|
70
72
|
PDF_CHART: str = "Structured chart extracted from PDF document."
|
|
71
73
|
PDF_IMAGE: str = "Image extracted from PDF document."
|
|
72
74
|
PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
|
|
75
|
+
PDF_PAGE_IMAGE: str = "Full-page image rendered from a PDF document."
|
|
73
76
|
PDF_TABLE: str = "Structured table extracted from PDF document."
|
|
74
77
|
PDF_TEXT: str = "Unstructured text from PDF document."
|
|
75
78
|
PPTX_IMAGE: str = "Image extracted from PPTX presentation."
|
|
@@ -94,6 +97,8 @@ class ContentTypeEnum(str, Enum):
|
|
|
94
97
|
Represents image content.
|
|
95
98
|
INFO_MSG : str
|
|
96
99
|
Represents an informational message.
|
|
100
|
+
PAGE_IMAGE : str
|
|
101
|
+
Represents a full-page image rendered from a document.
|
|
97
102
|
STRUCTURED : str
|
|
98
103
|
Represents structured content.
|
|
99
104
|
TEXT : str
|
|
@@ -111,6 +116,7 @@ class ContentTypeEnum(str, Enum):
|
|
|
111
116
|
INFOGRAPHIC: str = "infographic"
|
|
112
117
|
INFO_MSG: str = "info_message"
|
|
113
118
|
NONE: str = "none"
|
|
119
|
+
PAGE_IMAGE: str = "page_image"
|
|
114
120
|
STRUCTURED: str = "structured"
|
|
115
121
|
TABLE: str = "table"
|
|
116
122
|
TEXT: str = "text"
|
|
@@ -4,20 +4,21 @@
|
|
|
4
4
|
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
5
|
|
|
6
6
|
import base64
|
|
7
|
+
import inspect
|
|
7
8
|
import io
|
|
8
|
-
|
|
9
|
-
import pandas as pd
|
|
10
|
-
from typing import Any, Dict, List, Optional
|
|
11
9
|
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import Dict
|
|
12
|
+
from typing import List
|
|
13
|
+
from typing import Optional
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
)
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
|
|
17
|
+
from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
|
|
18
|
+
from nv_ingest_api.internal.extract.pdf.engines import nemoretriever_parse_extractor
|
|
19
|
+
from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
|
|
20
|
+
from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
|
|
21
|
+
from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
|
|
21
22
|
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
22
23
|
|
|
23
24
|
# Import extraction functions for different engines.
|
|
@@ -43,6 +44,7 @@ def _work_extract_pdf(
|
|
|
43
44
|
extract_infographics: bool,
|
|
44
45
|
extract_tables: bool,
|
|
45
46
|
extract_charts: bool,
|
|
47
|
+
extract_page_as_image: bool,
|
|
46
48
|
extractor_config: dict,
|
|
47
49
|
execution_trace_log=None,
|
|
48
50
|
) -> Any:
|
|
@@ -52,17 +54,25 @@ def _work_extract_pdf(
|
|
|
52
54
|
|
|
53
55
|
extract_method = extractor_config["extract_method"]
|
|
54
56
|
extractor_fn = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
57
|
+
|
|
58
|
+
extractor_fn_args = dict(
|
|
59
|
+
pdf_stream=pdf_stream,
|
|
60
|
+
extract_text=extract_text,
|
|
61
|
+
extract_images=extract_images,
|
|
62
|
+
extract_infographics=extract_infographics,
|
|
63
|
+
extract_tables=extract_tables,
|
|
64
|
+
extract_charts=extract_charts,
|
|
65
|
+
extractor_config=extractor_config,
|
|
66
|
+
execution_trace_log=execution_trace_log,
|
|
64
67
|
)
|
|
65
68
|
|
|
69
|
+
if "extract_page_as_image" in inspect.signature(extractor_fn).parameters:
|
|
70
|
+
extractor_fn_args["extract_page_as_image"] = extract_page_as_image
|
|
71
|
+
elif extract_page_as_image:
|
|
72
|
+
logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")
|
|
73
|
+
|
|
74
|
+
return extractor_fn(**extractor_fn_args)
|
|
75
|
+
|
|
66
76
|
|
|
67
77
|
@unified_exception_handler
|
|
68
78
|
def _orchestrate_row_extraction(
|
|
@@ -97,6 +107,7 @@ def _orchestrate_row_extraction(
|
|
|
97
107
|
extract_tables = params.pop("extract_tables", False)
|
|
98
108
|
extract_charts = params.pop("extract_charts", False)
|
|
99
109
|
extract_infographics = params.pop("extract_infographics", False)
|
|
110
|
+
extract_page_as_image = params.pop("extract_page_as_image", False)
|
|
100
111
|
extract_method = params.get("extract_method", "pdfium")
|
|
101
112
|
except KeyError as e:
|
|
102
113
|
raise ValueError(f"Missing required extraction flag: {e}")
|
|
@@ -137,6 +148,7 @@ def _orchestrate_row_extraction(
|
|
|
137
148
|
extract_text=extract_text,
|
|
138
149
|
extract_images=extract_images,
|
|
139
150
|
extract_infographics=extract_infographics,
|
|
151
|
+
extract_page_as_image=extract_page_as_image,
|
|
140
152
|
extract_tables=extract_tables,
|
|
141
153
|
extract_charts=extract_charts,
|
|
142
154
|
extractor_config=extractor_config,
|
|
@@ -24,6 +24,7 @@ import numpy as np
|
|
|
24
24
|
import pandas as pd
|
|
25
25
|
import pypdfium2 as libpdfium
|
|
26
26
|
|
|
27
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
27
28
|
from nv_ingest_api.internal.primitives.nim.default_values import YOLOX_MAX_BATCH_SIZE
|
|
28
29
|
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
|
|
29
30
|
YOLOX_PAGE_IMAGE_PREPROC_WIDTH,
|
|
@@ -35,6 +36,7 @@ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import (
|
|
|
35
36
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
|
|
36
37
|
from nv_ingest_api.internal.enums.common import TableFormatEnum, TextTypeEnum, AccessLevelEnum
|
|
37
38
|
from nv_ingest_api.util.metadata.aggregators import (
|
|
39
|
+
construct_image_metadata_from_base64,
|
|
38
40
|
construct_image_metadata_from_pdf_image,
|
|
39
41
|
extract_pdf_metadata,
|
|
40
42
|
construct_text_metadata,
|
|
@@ -47,6 +49,7 @@ from nv_ingest_api.util.pdf.pdfium import (
|
|
|
47
49
|
extract_image_like_objects_from_pdfium_page,
|
|
48
50
|
)
|
|
49
51
|
from nv_ingest_api.util.pdf.pdfium import pdfium_pages_to_numpy
|
|
52
|
+
from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
|
|
50
53
|
from nv_ingest_api.util.image_processing.transforms import numpy_to_base64, crop_image
|
|
51
54
|
|
|
52
55
|
logger = logging.getLogger(__name__)
|
|
@@ -385,6 +388,7 @@ def pdfium_extractor(
|
|
|
385
388
|
extract_infographics: bool,
|
|
386
389
|
extract_tables: bool,
|
|
387
390
|
extract_charts: bool,
|
|
391
|
+
extract_page_as_image: bool,
|
|
388
392
|
extractor_config: dict,
|
|
389
393
|
execution_trace_log: Optional[List[Any]] = None,
|
|
390
394
|
) -> pd.DataFrame:
|
|
@@ -525,6 +529,24 @@ def pdfium_extractor(
|
|
|
525
529
|
)
|
|
526
530
|
extracted_data.extend(image_data)
|
|
527
531
|
|
|
532
|
+
# Full page image extraction
|
|
533
|
+
if extract_page_as_image:
|
|
534
|
+
page_text = _extract_page_text(page)
|
|
535
|
+
image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log)
|
|
536
|
+
base64_image = numpy_to_base64(image[0])
|
|
537
|
+
if len(base64_image) > 2**24 - 1:
|
|
538
|
+
base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
|
|
539
|
+
image_meta = construct_image_metadata_from_base64(
|
|
540
|
+
base64_image,
|
|
541
|
+
page_idx,
|
|
542
|
+
page_count,
|
|
543
|
+
source_metadata,
|
|
544
|
+
base_unified_metadata,
|
|
545
|
+
subtype=ContentTypeEnum.PAGE_IMAGE,
|
|
546
|
+
text=page_text,
|
|
547
|
+
)
|
|
548
|
+
extracted_data.append(image_meta)
|
|
549
|
+
|
|
528
550
|
# If we want tables or charts, rasterize the page and store it
|
|
529
551
|
if extract_tables or extract_charts or extract_infographics:
|
|
530
552
|
image, padding_offsets = pdfium_pages_to_numpy(
|
|
@@ -575,6 +597,7 @@ def pdfium_extractor(
|
|
|
575
597
|
execution_trace_log=execution_trace_log,
|
|
576
598
|
)
|
|
577
599
|
futures.append(future)
|
|
600
|
+
|
|
578
601
|
pages_for_tables.clear()
|
|
579
602
|
|
|
580
603
|
# Wait for all asynchronous jobs to complete.
|
|
@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
|
|
|
107
107
|
model_name: Optional[str] = None
|
|
108
108
|
api_key: Optional[str] = None
|
|
109
109
|
filter_errors: bool = False
|
|
110
|
+
text_elements_modality: Optional[str] = None
|
|
111
|
+
image_elements_modality: Optional[str] = None
|
|
112
|
+
structured_elements_modality: Optional[str] = None
|
|
113
|
+
audio_elements_modality: Optional[str] = None
|
|
110
114
|
|
|
111
115
|
|
|
112
116
|
class IngestTaskVdbUploadSchema(BaseModelNoExt):
|
|
@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
|
|
|
195
199
|
validated_task_properties = expected_schema_cls(**task_properties)
|
|
196
200
|
values["type"] = task_type # ensure type is now always the enum
|
|
197
201
|
values["task_properties"] = validated_task_properties
|
|
202
|
+
|
|
198
203
|
return values
|
|
199
204
|
|
|
200
205
|
@field_validator("type", mode="before")
|
|
@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
|
|
|
22
22
|
input_type: str = Field(default="passage")
|
|
23
23
|
raise_on_failure: bool = Field(default=False)
|
|
24
24
|
truncate: str = Field(default="END")
|
|
25
|
+
text_elements_modality: str = Field(default="text")
|
|
26
|
+
image_elements_modality: str = Field(default="text")
|
|
27
|
+
structured_elements_modality: str = Field(default="text")
|
|
28
|
+
audio_elements_modality: str = Field(default="text")
|
|
25
29
|
|
|
26
30
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from functools import partial
|
|
7
8
|
from typing import Any, Dict, Tuple, Optional, Iterable, List
|
|
8
9
|
|
|
9
10
|
import pandas as pd
|
|
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
|
|
|
19
20
|
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
21
22
|
|
|
23
|
+
MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
|
|
24
|
+
|
|
25
|
+
|
|
22
26
|
# ------------------------------------------------------------------------------
|
|
23
27
|
# Asynchronous Embedding Requests
|
|
24
28
|
# ------------------------------------------------------------------------------
|
|
@@ -33,6 +37,7 @@ def _make_async_request(
|
|
|
33
37
|
input_type: str,
|
|
34
38
|
truncate: str,
|
|
35
39
|
filter_errors: bool,
|
|
40
|
+
modalities: Optional[List[str]] = None,
|
|
36
41
|
) -> list:
|
|
37
42
|
"""
|
|
38
43
|
Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
|
|
@@ -74,11 +79,18 @@ def _make_async_request(
|
|
|
74
79
|
base_url=embedding_nim_endpoint,
|
|
75
80
|
)
|
|
76
81
|
|
|
82
|
+
extra_body = {
|
|
83
|
+
"input_type": input_type,
|
|
84
|
+
"truncate": truncate,
|
|
85
|
+
}
|
|
86
|
+
if modalities:
|
|
87
|
+
extra_body["modality"] = modalities
|
|
88
|
+
|
|
77
89
|
resp = client.embeddings.create(
|
|
78
90
|
input=prompts,
|
|
79
91
|
model=embedding_model,
|
|
80
92
|
encoding_format=encoding_format,
|
|
81
|
-
extra_body={"input_type": input_type, "truncate": truncate},
|
|
93
|
+
extra_body=extra_body,
|
|
82
94
|
)
|
|
83
95
|
|
|
84
96
|
response["embedding"] = resp.data
|
|
@@ -110,6 +122,7 @@ def _async_request_handler(
|
|
|
110
122
|
input_type: str,
|
|
111
123
|
truncate: str,
|
|
112
124
|
filter_errors: bool,
|
|
125
|
+
modalities: Optional[List[str]] = None,
|
|
113
126
|
) -> List[dict]:
|
|
114
127
|
"""
|
|
115
128
|
Gathers calculated embedding results from the NIM embedding service concurrently.
|
|
@@ -138,6 +151,9 @@ def _async_request_handler(
|
|
|
138
151
|
List[dict]
|
|
139
152
|
A list of response dictionaries from the embedding service.
|
|
140
153
|
"""
|
|
154
|
+
if modalities is None:
|
|
155
|
+
modalities = [None] * len(prompts)
|
|
156
|
+
|
|
141
157
|
with ThreadPoolExecutor() as executor:
|
|
142
158
|
futures = [
|
|
143
159
|
executor.submit(
|
|
@@ -150,8 +166,9 @@ def _async_request_handler(
|
|
|
150
166
|
input_type=input_type,
|
|
151
167
|
truncate=truncate,
|
|
152
168
|
filter_errors=filter_errors,
|
|
169
|
+
modalities=modality_batch,
|
|
153
170
|
)
|
|
154
|
-
for prompt_batch in prompts
|
|
171
|
+
for prompt_batch, modality_batch in zip(prompts, modalities)
|
|
155
172
|
]
|
|
156
173
|
results = [future.result() for future in futures]
|
|
157
174
|
|
|
@@ -167,6 +184,7 @@ def _async_runner(
|
|
|
167
184
|
input_type: str,
|
|
168
185
|
truncate: str,
|
|
169
186
|
filter_errors: bool,
|
|
187
|
+
modalities: Optional[List[str]] = None,
|
|
170
188
|
) -> dict:
|
|
171
189
|
"""
|
|
172
190
|
Concurrently launches all NIM embedding requests and flattens the results.
|
|
@@ -204,6 +222,7 @@ def _async_runner(
|
|
|
204
222
|
input_type,
|
|
205
223
|
truncate,
|
|
206
224
|
filter_errors,
|
|
225
|
+
modalities=modalities,
|
|
207
226
|
)
|
|
208
227
|
|
|
209
228
|
flat_results = {"embeddings": [], "info_msgs": []}
|
|
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
|
|
|
263
282
|
return row
|
|
264
283
|
|
|
265
284
|
|
|
266
|
-
def _get_pandas_text_content(row):
|
|
285
|
+
def _format_image_input_string(image_b64: Optional[str]) -> str:
|
|
286
|
+
if not image_b64:
|
|
287
|
+
return
|
|
288
|
+
return f"data:image/png;base64,{image_b64}"
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
|
|
292
|
+
if (not text) or (not text.strip()) or (not image_b64):
|
|
293
|
+
return
|
|
294
|
+
return f"{text.strip()} {_format_image_input_string(image_b64)}"
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _get_pandas_text_content(row, modality="text"):
|
|
267
298
|
"""
|
|
268
299
|
Extracts text content from a DataFrame row.
|
|
269
300
|
|
|
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
|
|
|
280
311
|
return row["content"]
|
|
281
312
|
|
|
282
313
|
|
|
283
|
-
def _get_pandas_table_content(row):
|
|
314
|
+
def _get_pandas_table_content(row, modality="text"):
|
|
284
315
|
"""
|
|
285
316
|
Extracts table/chart content from a DataFrame row.
|
|
286
317
|
|
|
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
|
|
|
294
325
|
str
|
|
295
326
|
The table/chart content from the row.
|
|
296
327
|
"""
|
|
297
|
-
|
|
328
|
+
if modality == "text":
|
|
329
|
+
content = row.get("table_metadata", {}).get("table_content")
|
|
330
|
+
elif modality == "image":
|
|
331
|
+
content = _format_image_input_string(row.get("content"))
|
|
332
|
+
elif modality == "text_image":
|
|
333
|
+
text = row.get("table_metadata", {}).get("table_content")
|
|
334
|
+
image = row.get("content")
|
|
335
|
+
content = _format_text_image_pair_input_string(text, image)
|
|
336
|
+
|
|
337
|
+
return content
|
|
298
338
|
|
|
299
339
|
|
|
300
|
-
def _get_pandas_image_content(row):
|
|
340
|
+
def _get_pandas_image_content(row, modality="text"):
|
|
301
341
|
"""
|
|
302
342
|
Extracts image caption content from a DataFrame row.
|
|
303
343
|
|
|
@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
|
|
|
311
351
|
str
|
|
312
352
|
The image caption from the row.
|
|
313
353
|
"""
|
|
314
|
-
|
|
354
|
+
subtype = row.get("content_metadata", {}).get("subtype")
|
|
355
|
+
if modality == "text":
|
|
356
|
+
if subtype == "page_image":
|
|
357
|
+
content = row.get("image_metadata", {}).get("text")
|
|
358
|
+
else:
|
|
359
|
+
content = row.get("image_metadata", {}).get("caption")
|
|
360
|
+
elif modality == "image":
|
|
361
|
+
content = _format_image_input_string(row.get("content"))
|
|
362
|
+
elif modality == "text_image":
|
|
363
|
+
if subtype == "page_image":
|
|
364
|
+
text = row.get("image_metadata", {}).get("text")
|
|
365
|
+
else:
|
|
366
|
+
text = row.get("image_metadata", {}).get("caption")
|
|
367
|
+
image = row.get("content")
|
|
368
|
+
content = _format_text_image_pair_input_string(text, image)
|
|
315
369
|
|
|
370
|
+
# A workaround to save memory.
|
|
371
|
+
row["content"] = ""
|
|
372
|
+
return content
|
|
316
373
|
|
|
317
|
-
|
|
374
|
+
|
|
375
|
+
def _get_pandas_audio_content(row, modality="text"):
|
|
318
376
|
"""
|
|
319
377
|
A pandas UDF used to select extracted audio transcription to be used to create embeddings.
|
|
320
378
|
"""
|
|
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
|
|
|
408
466
|
# ------------------------------------------------------------------------------
|
|
409
467
|
|
|
410
468
|
|
|
469
|
+
def does_model_support_multimodal_embeddings(model: str) -> bool:
|
|
470
|
+
"""
|
|
471
|
+
Checks if a given model supports multi-modal embeddings.
|
|
472
|
+
|
|
473
|
+
Parameters
|
|
474
|
+
----------
|
|
475
|
+
model : str
|
|
476
|
+
The name of the model.
|
|
477
|
+
|
|
478
|
+
Returns
|
|
479
|
+
-------
|
|
480
|
+
bool
|
|
481
|
+
True if the model supports multi-modal embeddings, False otherwise.
|
|
482
|
+
"""
|
|
483
|
+
return model in MULTI_MODAL_MODELS
|
|
484
|
+
|
|
485
|
+
|
|
411
486
|
def transform_create_text_embeddings_internal(
|
|
412
487
|
df_transform_ledger: pd.DataFrame,
|
|
413
488
|
task_config: Dict[str, Any],
|
|
@@ -460,6 +535,15 @@ def transform_create_text_embeddings_internal(
|
|
|
460
535
|
ContentTypeEnum.AUDIO: _get_pandas_audio_content,
|
|
461
536
|
ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
|
|
462
537
|
}
|
|
538
|
+
task_type_to_modality = {
|
|
539
|
+
ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
|
|
540
|
+
ContentTypeEnum.STRUCTURED: (
|
|
541
|
+
task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
|
|
542
|
+
),
|
|
543
|
+
ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
|
|
544
|
+
ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
|
|
545
|
+
ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
|
|
546
|
+
}
|
|
463
547
|
|
|
464
548
|
def _content_type_getter(row):
|
|
465
549
|
return row["content_metadata"]["type"]
|
|
@@ -480,7 +564,7 @@ def transform_create_text_embeddings_internal(
|
|
|
480
564
|
# Extract content and normalize empty or non-str to None
|
|
481
565
|
extracted_content = (
|
|
482
566
|
df_content["metadata"]
|
|
483
|
-
.apply(content_getter)
|
|
567
|
+
.apply(partial(content_getter, modality=task_type_to_modality[content_type]))
|
|
484
568
|
.apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
|
|
485
569
|
)
|
|
486
570
|
df_content["_content"] = extracted_content
|
|
@@ -488,9 +572,15 @@ def transform_create_text_embeddings_internal(
|
|
|
488
572
|
# Prepare batches for only valid (non-None) content
|
|
489
573
|
valid_content_mask = df_content["_content"].notna()
|
|
490
574
|
if valid_content_mask.any():
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
575
|
+
filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
|
|
576
|
+
filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
|
|
577
|
+
|
|
578
|
+
if model_name in MULTI_MODAL_MODELS:
|
|
579
|
+
modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
|
|
580
|
+
modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
|
|
581
|
+
else:
|
|
582
|
+
modality_batches = None
|
|
583
|
+
|
|
494
584
|
content_embeddings = _async_runner(
|
|
495
585
|
filtered_content_batches,
|
|
496
586
|
api_key,
|
|
@@ -500,6 +590,7 @@ def transform_create_text_embeddings_internal(
|
|
|
500
590
|
transform_config.input_type,
|
|
501
591
|
transform_config.truncate,
|
|
502
592
|
False,
|
|
593
|
+
modalities=modality_batches,
|
|
503
594
|
)
|
|
504
595
|
# Build a simple row index -> embedding map
|
|
505
596
|
embeddings_dict = dict(
|
|
@@ -201,6 +201,8 @@ def construct_image_metadata_from_base64(
|
|
|
201
201
|
page_count: int,
|
|
202
202
|
source_metadata: Dict[str, Any],
|
|
203
203
|
base_unified_metadata: Dict[str, Any],
|
|
204
|
+
subtype: None | ContentTypeEnum | str = "",
|
|
205
|
+
text: str = "",
|
|
204
206
|
) -> List[Any]:
|
|
205
207
|
"""
|
|
206
208
|
Extracts image data from a base64-encoded image string, decodes the image to get
|
|
@@ -252,6 +254,7 @@ def construct_image_metadata_from_base64(
|
|
|
252
254
|
"line": -1,
|
|
253
255
|
"span": -1,
|
|
254
256
|
},
|
|
257
|
+
"subtype": subtype or "",
|
|
255
258
|
}
|
|
256
259
|
|
|
257
260
|
# Construct image metadata
|
|
@@ -259,7 +262,7 @@ def construct_image_metadata_from_base64(
|
|
|
259
262
|
"image_type": DocumentTypeEnum.PNG,
|
|
260
263
|
"structured_image_type": ContentTypeEnum.UNKNOWN,
|
|
261
264
|
"caption": "",
|
|
262
|
-
"text": "",
|
|
265
|
+
"text": text,
|
|
263
266
|
"image_location": bbox,
|
|
264
267
|
"image_location_max_dimensions": (width, height),
|
|
265
268
|
"height": height,
|
|
@@ -7,7 +7,7 @@ nv_ingest_api/interface/transform.py,sha256=g6YnFR7TpEU0xNtzCvv6kqnFbuCwQ6vRMjjB
|
|
|
7
7
|
nv_ingest_api/interface/utility.py,sha256=AL4l0cJNvTjG1MAe1YNTk1jbbPED3g4HCewzx6Ffcio,7296
|
|
8
8
|
nv_ingest_api/internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
nv_ingest_api/internal/enums/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
10
|
-
nv_ingest_api/internal/enums/common.py,sha256=
|
|
10
|
+
nv_ingest_api/internal/enums/common.py,sha256=lzDJ35VWfIwlL_Lx_q0dfHUuwEB7CXudHIQAilpjoRw,12611
|
|
11
11
|
nv_ingest_api/internal/extract/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
|
|
12
12
|
nv_ingest_api/internal/extract/audio/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
13
13
|
nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=_jf_UC_FTqZr-xEpwG8edwBzdDjM01gGhqm9ulOsDcY,6973
|
|
@@ -32,10 +32,10 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
|
|
|
32
32
|
nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
|
|
33
33
|
nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
|
|
34
34
|
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=XNYz4S2tMFBv0KFzXNERrVs-1raxJ_iIIXpBGlJFcD0,22987
|
|
35
|
-
nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=
|
|
35
|
+
nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=8hUJUdpx6FhOBgabFmGhJiAQdl12kR8YoSbUfN-geOk,23506
|
|
36
36
|
nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
|
|
37
37
|
nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
|
|
38
|
-
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=
|
|
38
|
+
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=4bvN6LsPksLicI6jM0JqbJFiOZNHEcuc8MVVW4XfgV8,5875
|
|
39
39
|
nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
|
|
40
40
|
nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
|
|
41
41
|
nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -82,7 +82,7 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
|
|
|
82
82
|
nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
|
|
83
83
|
nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
84
84
|
nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
|
|
85
|
-
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=
|
|
85
|
+
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=ceYQjRjhBSDbbZ6q-Db7Y6GHVOvWPdGAMb3TX1vMWfY,8321
|
|
86
86
|
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=VnAzkSFat_ckI19mlwQTlFrvP6EZVCwyNl9bt51b8oU,7193
|
|
87
87
|
nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
88
88
|
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
|
|
@@ -92,14 +92,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
|
|
|
92
92
|
nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
93
93
|
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=OtM1iPw26uioC3mghbOJQurKGg641uQfhASH462VqOY,578
|
|
94
94
|
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
|
|
95
|
-
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=
|
|
95
|
+
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=RZCISA8CUqKiY8eJuk4uWxzo4PZ-fuYdzMO7_LYFkoM,1117
|
|
96
96
|
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
|
|
97
97
|
nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
98
98
|
nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
|
|
99
99
|
nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
|
|
100
100
|
nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
101
101
|
nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
|
|
102
|
-
nv_ingest_api/internal/transform/embed_text.py,sha256=
|
|
102
|
+
nv_ingest_api/internal/transform/embed_text.py,sha256=kvVGlNH1S91UENXWLD31uh3KzlfJYOlYitpIFMsyowU,20033
|
|
103
103
|
nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
|
|
104
104
|
nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
105
105
|
nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -123,7 +123,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
|
|
|
123
123
|
nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
|
|
124
124
|
nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
|
|
125
125
|
nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
|
|
126
|
-
nv_ingest_api/util/image_processing/transforms.py,sha256=
|
|
126
|
+
nv_ingest_api/util/image_processing/transforms.py,sha256=3-xeUerc2AaXJTYuR23EjwdtjRQ8F85pS5D9zxR4cLA,23452
|
|
127
127
|
nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
128
128
|
nv_ingest_api/util/imports/callable_signatures.py,sha256=e2bJB1pmkN4Ee-Bf-VggOSBaQ4RXofWF5eKkWXgIj2U,1855
|
|
129
129
|
nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=7GByV_-8z2X0tnVoabCxVioxOP3sYMros3ZllVAW-wY,4343
|
|
@@ -135,7 +135,7 @@ nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=h9Q4q_
|
|
|
135
135
|
nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
|
|
136
136
|
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=fh7Q0wO5H_FtrHV1VdT6V66aZNqglOh_2XdkfLt8hgg,15722
|
|
137
137
|
nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
|
|
138
|
-
nv_ingest_api/util/metadata/aggregators.py,sha256=
|
|
138
|
+
nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
|
|
139
139
|
nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
|
|
140
140
|
nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
|
|
141
141
|
nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
|
|
@@ -153,8 +153,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
|
|
|
153
153
|
nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
|
|
154
154
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
|
|
156
|
-
nv_ingest_api-2025.7.
|
|
157
|
-
nv_ingest_api-2025.7.
|
|
158
|
-
nv_ingest_api-2025.7.
|
|
159
|
-
nv_ingest_api-2025.7.
|
|
160
|
-
nv_ingest_api-2025.7.
|
|
156
|
+
nv_ingest_api-2025.7.16.dev20250716.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
157
|
+
nv_ingest_api-2025.7.16.dev20250716.dist-info/METADATA,sha256=RaPAkQ4Dtkkrn6hi9Va1t2XDpDgRbe-bFqmCVL3IlEA,13947
|
|
158
|
+
nv_ingest_api-2025.7.16.dev20250716.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
159
|
+
nv_ingest_api-2025.7.16.dev20250716.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
|
|
160
|
+
nv_ingest_api-2025.7.16.dev20250716.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|