nv-ingest-api 2025.4.16.dev20250416__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff shows the differences between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.16.dev20250416.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,382 @@ nv_ingest_api/interface/transform.py
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from io import BytesIO
+from typing import Optional, Dict, List, Union
+
+import pandas as pd
+
+from nv_ingest_api.interface.utility import (
+    build_dataframe_from_files,
+)
+from nv_ingest_api.internal.enums.common import DocumentTypeEnum
+from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
+from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
+from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+from nv_ingest_api.internal.transform.caption_image import transform_image_create_vlm_caption_internal
+from nv_ingest_api.internal.transform.embed_text import transform_create_text_embeddings_internal
+from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+
+@unified_exception_handler
+def transform_text_create_embeddings(
+    *,
+    inputs: pd.DataFrame,
+    api_key: str,
+    batch_size: Optional[int] = 8192,
+    embedding_model: Optional[str] = None,
+    embedding_nim_endpoint: Optional[str] = None,
+    encoding_format: Optional[str] = None,
+    input_type: Optional[str] = None,
+    truncate: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Creates text embeddings using the provided configuration.
+    Parameters provided as None will use the default values from EmbedExtractionsSchema.
+    """
+    task_config = {}
+
+    # Build configuration parameters only if provided; defaults come from EmbedExtractionsSchema.
+    config_kwargs = {
+        "batch_size": batch_size,
+        "embedding_model": embedding_model,
+        "embedding_nim_endpoint": embedding_nim_endpoint,
+        "encoding_format": encoding_format,
+        "input_type": input_type,
+        "truncate": truncate,
+    }
+    # Remove any keys with a None value.
+    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
+    config_kwargs["api_key"] = api_key
+
+    transform_config = TextEmbeddingSchema(**config_kwargs)
+
+    result, _ = transform_create_text_embeddings_internal(
+        df_transform_ledger=inputs,
+        task_config=task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )
+
+    return result
+
+
+@unified_exception_handler
+def transform_image_create_vlm_caption(
+    *,
+    inputs: Union[pd.DataFrame, tuple, List[tuple]],
+    api_key: Optional[str] = None,
+    prompt: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    model_name: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Extract captions for image content using the VLM model API.
+
+    This function processes image content for caption generation. It accepts input in one
+    of three forms:
+
+    1. A pandas DataFrame with the following required structure:
+       - Columns:
+         - ``source_name`` (str): Identifier for the source file.
+         - ``source_id`` (str): Unique identifier for the file.
+         - ``content`` (str): Base64-encoded string representing the file content.
+         - ``document_type`` (str): A string representing the document type (e.g., DocumentTypeEnum.PNG).
+         - ``metadata`` (dict): A dictionary containing at least:
+           - ``content``: Same as the base64-encoded file content.
+           - ``source_metadata``: Dictionary created via :func:`create_source_metadata`.
+           - ``content_metadata``: Dictionary created via :func:`create_content_metadata`.
+           - ``image_metadata``: For image files, initialized as an empty dict ({}); other metadata fields
+             (audio_metadata, text_metadata, etc.) are typically None or empty.
+           - ``raise_on_failure``: Boolean flag (typically False).
+
+    2. A single tuple of the form ``(file_source, document_type)``.
+       - ``file_source``: Either a file path (str) or a file-like object (e.g., BytesIO).
+       - ``document_type``: A string representing the document type (e.g., DocumentTypeEnum.PNG).
+
+    3. A list of such tuples.
+
+    For non-DataFrame inputs, a DataFrame is constructed using the helper function
+    :func:`build_dataframe_from_files`. When the file_source is a file-like object, its content
+    is converted to a base64-encoded string using :func:`read_bytesio_as_base64`; if it is a file
+    path (str), :func:`read_file_as_base64` is used.
+
+    Parameters
+    ----------
+    inputs : Union[pd.DataFrame, tuple, List[tuple]]
+        Input data representing image content. Accepted formats:
+          - A pandas DataFrame with the required structure as described above.
+          - A single tuple ``(file_source, document_type)``.
+          - A list of tuples of the form ``(file_source, document_type)``.
+        In the tuples, ``file_source`` is either a file path (str) or a file-like object (e.g., BytesIO),
+        and ``document_type`` is a string (typically one of the DocumentTypeEnum values).
+
+    api_key : Optional[str], default=None
+        API key for authentication with the VLM endpoint. If not provided, defaults are used.
+
+    prompt : Optional[str], default=None
+        Text prompt to guide caption generation.
+
+    endpoint_url : Optional[str], default=None
+        URL of the VLM model HTTP endpoint.
+
+    model_name : Optional[str], default=None
+        Name of the model to be used for caption generation.
+
+    Returns
+    -------
+    pd.DataFrame
+        A pandas DataFrame with generated captions inserted into the
+        ``metadata.image_metadata.caption`` field for each image row.
+
+    Raises
+    ------
+    ValueError
+        If the input is not a DataFrame, tuple, or list of tuples, or if any tuple is not of length 2.
+    Exception
+        Propagates any exception encountered during processing or caption extraction.
+
+    Examples
+    --------
+    >>> # Example using a DataFrame:
+    >>> df = pd.DataFrame({
+    ...     "source_name": ["image.png"],
+    ...     "source_id": ["image.png"],
+    ...     "content": ["<base64-string>"],
+    ...     "document_type": ["png"],
+    ...     "metadata": [{
+    ...         "content": "<base64-string>",
+    ...         "source_metadata": {...},
+    ...         "content_metadata": {...},
+    ...         "image_metadata": {},
+    ...         "raise_on_failure": False,
+    ...     }],
+    ... })
+    >>> transform_image_create_vlm_caption(inputs=df, api_key="key", prompt="Caption the image:")
+
+    >>> # Example using a tuple:
+    >>> transform_image_create_vlm_caption(inputs=("image.png", DocumentTypeEnum.PNG), api_key="key",
+            prompt="Caption the image:")
+
+    >>> # Example using a list of tuples with file paths:
+    >>> transform_image_create_vlm_caption(inputs=[("image.png", DocumentTypeEnum.PNG),
+            ("image2.png", DocumentTypeEnum.PNG)], api_key="key", prompt="Caption the image:")
+
+    >>> # Example using a list of tuples with BytesIO objects:
+    >>> from io import BytesIO
+    >>> with open("image.png", "rb") as f:
+    ...     bytes_io = BytesIO(f.read())
+    >>> transform_image_create_vlm_caption(inputs=[(bytes_io, DocumentTypeEnum.PNG)],
+            api_key="key", prompt="Caption the image:")
+    """
+    if not isinstance(inputs, pd.DataFrame):
+        # Normalize a single tuple to a list.
+        if isinstance(inputs, tuple):
+            file_items = [inputs]
+        elif isinstance(inputs, list):
+            file_items = inputs
+        else:
+            raise ValueError(
+                "df_ledger must be a DataFrame, a tuple (file_source, document_type), or a list of such tuples."
+            )
+
+        file_sources: List[Union[str, BytesIO]] = []
+        source_names: List[str] = []
+        source_ids: List[str] = []
+        doc_types: List[str] = []
+
+        for item in file_items:
+            if not (isinstance(item, tuple) and len(item) == 2):
+                raise ValueError("Each item must be a tuple of (file_source, document_type).")
+            file_source, doc_type = item
+            file_sources.append(file_source)
+            # Use the file_source string as the identifier if available; else construct one.
+            if isinstance(file_source, str):
+                identifier = file_source
+            else:
+                identifier = f"bytesio_{doc_type}"
+            source_names.append(identifier)
+            source_ids.append(identifier)
+            doc_types.append(doc_type)
+
+        inputs = build_dataframe_from_files(file_sources, source_names, source_ids, doc_types)
+
+    task_config: Dict[str, Optional[str]] = {
+        "api_key": api_key,
+        "prompt": prompt,
+        "endpoint_url": endpoint_url,
+        "model_name": model_name,
+    }
+    filtered_task_config: Dict[str, str] = {k: v for k, v in task_config.items() if v is not None}
+
+    transform_config = ImageCaptionExtractionSchema(**filtered_task_config)
+
+    result = transform_image_create_vlm_caption_internal(
+        df_transform_ledger=inputs,
+        task_config=filtered_task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )
+
+    return result
+
+
+@unified_exception_handler
+def transform_text_split_and_tokenize(
+    *,
+    inputs: Union[pd.DataFrame, str, List[str]],
+    tokenizer: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    split_source_types: Optional[List[str]] = None,
+    hugging_face_access_token: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Transform and tokenize text documents by splitting them into smaller chunks.
+
+    This function prepares the configuration parameters for text splitting and tokenization,
+    and then delegates the splitting and asynchronous tokenization to an internal function.
+
+    The function accepts input in one of two forms:
+
+    1. A pandas DataFrame that already follows the required structure:
+
+       Required DataFrame Structure:
+         - source_name (str): Identifier for the source document.
+         - source_id (str): Unique identifier for the document.
+         - content (str): The document content (typically as a base64-encoded string).
+         - document_type (str): For plain text, set to DocumentTypeEnum.TXT.
+         - metadata (dict): Must contain:
+             * content: The original text content.
+             * content_metadata: A dictionary with a key "type" (e.g., "text").
+             * source_metadata: A dictionary with source-specific metadata (e.g., file path, timestamps).
+             * Other keys (audio_metadata, image_metadata, etc.) set to None or empty as appropriate.
+             * raise_on_failure: Boolean (typically False).
+
+    2. A plain text string or a list of plain text strings.
+       In this case, the function converts each text into a BytesIO object (encoding it as UTF-8)
+       and then uses the helper function `build_dataframe_from_files` to construct a DataFrame where:
+         - source_name and source_id are generated as "text_0", "text_1", etc.
+         - content is the base64-encoded representation of the UTF-8 encoded text.
+         - document_type is set to DocumentTypeEnum.TXT.
+         - metadata is constructed using helper functions (for source and content metadata),
+           with content_metadata's "type" set to "text".
+
+    Parameters
+    ----------
+    inputs : Union[pd.DataFrame, str, List[str]]
+        Either a DataFrame following the required structure, a single plain text string,
+        or a list of plain text strings.
+    tokenizer : str
+        Identifier or path of the tokenizer to be used (e.g., "bert-base-uncased").
+    chunk_size : int
+        Maximum number of tokens per chunk.
+    chunk_overlap : int
+        Number of tokens to overlap between consecutive chunks.
+    split_source_types : Optional[List[str]], default=["text"]
+        List of source types to filter for text splitting. If None or empty, defaults to ["text"].
+    hugging_face_access_token : Optional[str], default=None
+        Access token for Hugging Face authentication, if required.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the processed documents, where text content has been split into smaller chunks.
+        The returned DataFrame retains the original columns and updates the "metadata" field with
+        generated tokenized segments and embedding information.
+
+    Raises
+    ------
+    Exception
+        Propagates any exceptions encountered during text splitting and tokenization, with additional
+        context provided by the unified exception handler.
+
+    Examples
+    --------
+    >>> # Using a DataFrame:
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     "source_name": ["doc1.txt"],
+    ...     "source_id": ["doc1.txt"],
+    ...     "content": ["<base64-encoded text>"],
+    ...     "document_type": ["text"],
+    ...     "metadata": [{
+    ...         "content": "This is a document.",
+    ...         "content_metadata": {"type": "text"},
+    ...         "source_metadata": {"source_id": "doc1.txt", "source_name": "doc1.txt", "source_type": "txt"},
+    ...         "audio_metadata": None,
+    ...         "image_metadata": None,
+    ...         "text_metadata": None,
+    ...         "raise_on_failure": False,
+    ...     }],
+    ... })
+    >>> transform_text_split_and_tokenize(
+    ...     inputs=df,
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+
+    >>> # Using a single plain text string:
+    >>> transform_text_split_and_tokenize(
+    ...     inputs="This is a plain text document.",
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+
+    >>> # Using a list of plain text strings:
+    >>> texts = ["Document one text.", "Document two text."]
+    >>> transform_text_split_and_tokenize(
+    ...     inputs=texts,
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+    """
+    # If input is not a DataFrame, assume it is a string or list of strings and construct a DataFrame.
+    if not isinstance(inputs, pd.DataFrame):
+        if isinstance(inputs, str):
+            texts = [inputs]
+        elif isinstance(inputs, list) and all(isinstance(t, str) for t in inputs):
+            texts = inputs
+        else:
+            raise ValueError("df_ledger must be a DataFrame, a string, or a list of strings.")
+        # Convert each text string to a BytesIO object with UTF-8 encoding.
+        file_sources = [BytesIO(text.encode("utf-8")) for text in texts]
+        # Generate unique identifiers for source_name and source_id.
+        source_names = [f"text_{i}" for i in range(len(texts))]
+        source_ids = source_names.copy()
+        # For plain text, document type is set to DocumentTypeEnum.TXT.
+        doc_types = [DocumentTypeEnum.TXT for _ in texts]
+        inputs = build_dataframe_from_files(file_sources, source_names, source_ids, doc_types)
+
+    if not split_source_types:
+        split_source_types = ["text"]
+
+    task_config: Dict[str, any] = {
+        "chunk_overlap": chunk_overlap,
+        "chunk_size": chunk_size,
+        "params": {
+            "hf_access_token": hugging_face_access_token,
+            "split_source_types": split_source_types,
+        },
+        "tokenizer": tokenizer,
+    }
+
+    transform_config: TextSplitterSchema = TextSplitterSchema(
+        chunk_overlap=chunk_overlap,
+        chunk_size=chunk_size,
+        tokenizer=tokenizer,
+    )
+
+    result = transform_text_split_and_tokenize_internal(
+        df_transform_ledger=inputs,
+        task_config=task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )
+
+    return result
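
Usage note (not part of the diff): the public transform entry points added above can be chained into a simple split-then-embed flow. The sketch below is illustrative only; the tokenizer name, API key, and NIM endpoint URL are placeholder assumptions, and whether the split output can be fed directly into the embedding step without further extraction is also an assumption based on the ledger DataFrame shape shown in the docstrings.

    from nv_ingest_api.interface.transform import (
        transform_text_create_embeddings,
        transform_text_split_and_tokenize,
    )

    # Split two plain-text documents into token-bounded chunks; the strings are
    # converted into a ledger DataFrame internally by build_dataframe_from_files.
    chunked_df = transform_text_split_and_tokenize(
        inputs=["First document text.", "Second document text."],
        tokenizer="bert-base-uncased",  # placeholder tokenizer identifier
        chunk_size=512,
        chunk_overlap=50,
    )

    # Embed the resulting chunks; options left as None fall back to the schema defaults.
    embedded_df = transform_text_create_embeddings(
        inputs=chunked_df,
        api_key="nvapi-...",  # placeholder credential
        embedding_nim_endpoint="http://localhost:8000/v1",  # assumed local NIM endpoint
    )

    print(embedded_df.columns.tolist())
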
@@ -0,0 +1,200 @@ nv_ingest_api/interface/utility.py
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import os
+from io import BytesIO
+
+import pandas as pd
+from datetime import datetime
+from typing import List, Union
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum, DocumentTypeEnum
+
+# ------------------------------------------------------------------------------
+# Mapping from DocumentTypeEnum to ContentTypeEnum
+# ------------------------------------------------------------------------------
+DOCUMENT_TO_CONTENT_MAPPING = {
+    DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.HTML: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.MD: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
+    DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
+    DocumentTypeEnum.UNKNOWN: ContentTypeEnum.UNKNOWN,
+}
+
+
+# ------------------------------------------------------------------------------
+# Helper function to get the document type from a file extension.
+# ------------------------------------------------------------------------------
+def get_document_type_from_extension(file_path: str) -> str:
+    ext = os.path.splitext(file_path)[1].lower()
+    mapping = {
+        ".png": DocumentTypeEnum.PNG,
+        ".jpg": DocumentTypeEnum.JPEG,
+        ".jpeg": DocumentTypeEnum.JPEG,
+        ".tiff": DocumentTypeEnum.TIFF,
+        ".svg": DocumentTypeEnum.SVG,
+    }
+    return mapping.get(ext, DocumentTypeEnum.UNKNOWN)
+
+
+# ------------------------------------------------------------------------------
+# Helper function to read a file and return its base64-encoded string.
+# ------------------------------------------------------------------------------
+def read_file_as_base64(file_path: str) -> str:
+    """
+    Reads the file at file_path in binary mode and returns its base64-encoded string.
+    """
+    with open(file_path, "rb") as f:
+        file_bytes = f.read()
+    return base64.b64encode(file_bytes).decode("utf-8")
+
+
+# ------------------------------------------------------------------------------
+# Helper function to read a BytesIO object and return its base64-encoded string.
+# ------------------------------------------------------------------------------
+def read_bytesio_as_base64(file_io: BytesIO) -> str:
+    """
+    Reads a BytesIO object and returns its base64-encoded string.
+
+    Parameters:
+        file_io (BytesIO): A file-like object containing binary data.
+
+    Returns:
+        str: The base64-encoded string representation of the file's contents.
+    """
+    file_bytes = file_io.getvalue()
+    return base64.b64encode(file_bytes).decode("utf-8")
+
+
+# ------------------------------------------------------------------------------
+# Helper function to create source metadata.
+# ------------------------------------------------------------------------------
+def create_source_metadata(source_name: str, source_id: str, document_type: str) -> dict:
+    """
+    Creates a source metadata dictionary for a file.
+
+    The source_type is set to the provided document_type.
+    The date_created and last_modified fields are set to the current ISO timestamp.
+    """
+    now_iso = datetime.now().isoformat()
+    return {
+        "source_name": source_name,
+        "source_id": source_id,
+        "source_location": "",
+        "source_type": document_type,  # e.g., "pdf", "png", etc.
+        "collection_id": "",
+        "date_created": now_iso,
+        "last_modified": now_iso,
+        "summary": "",
+        "partition_id": -1,
+        "access_level": "unknown",  # You may wish to adjust this if needed.
+    }
+
+
+# ------------------------------------------------------------------------------
+# Helper function to create content metadata.
+# ------------------------------------------------------------------------------
+def create_content_metadata(document_type: str) -> dict:
+    """
+    Creates a content metadata dictionary for a file based on its document type.
+
+    It maps the document type to the corresponding content type.
+    """
+    # Use the mapping; if document_type is not found, fallback to "unknown".
+    content_type = DOCUMENT_TO_CONTENT_MAPPING.get(document_type, ContentTypeEnum.UNKNOWN)
+    return {
+        "type": content_type,
+        "description": "",
+        "page_number": -1,
+        "hierarchy": {
+            "page_count": -1,
+            "page": -1,
+            "block": -1,
+            "line": -1,
+            "span": -1,
+            "nearby_objects": {
+                "text": {"content": [], "bbox": [], "type": []},
+                "images": {"content": [], "bbox": [], "type": []},
+                "structured": {"content": [], "bbox": [], "type": []},
+            },
+        },
+        "subtype": "",
+    }
+
+
+# ------------------------------------------------------------------------------
+# Main helper function to build a DataFrame from lists of files.
+# ------------------------------------------------------------------------------
+def build_dataframe_from_files(
+    file_paths: List[Union[str, BytesIO]],
+    source_names: List[str],
+    source_ids: List[str],
+    document_types: List[str],
+) -> pd.DataFrame:
+    """
+    Given lists of file paths (or BytesIO objects), source names, source IDs, and document types,
+    reads each file (base64-encoding its contents) and constructs a DataFrame.
+
+    For image content, 'image_metadata' is initialized as an empty dict, so it can later be updated.
+    """
+    rows = []
+    # Validate that all lists have the same length.
+    if not (len(file_paths) == len(source_names) == len(source_ids) == len(document_types)):
+        raise ValueError("All input lists must have the same length.")
+
+    for fp, sname, sid, d_type in zip(file_paths, source_names, source_ids, document_types):
+        # Determine if fp is a file path (str) or a file-like object (e.g., BytesIO).
+        if isinstance(fp, str):
+            encoded_content = read_file_as_base64(fp)
+        elif hasattr(fp, "read"):
+            encoded_content = read_bytesio_as_base64(fp)
+        else:
+            raise ValueError("Each element in file_paths must be a string or a file-like object.")
+
+        # Build metadata components.
+        source_meta = create_source_metadata(sname, sid, d_type)
+        content_meta = create_content_metadata(d_type)
+        # If the content type is image, initialize image_metadata as {}.
+        image_metadata = {} if content_meta.get("type") == ContentTypeEnum.IMAGE else None
+
+        # Assemble the complete metadata dictionary.
+        metadata = {
+            "content": encoded_content,
+            "content_url": "",
+            "embedding": None,
+            "source_metadata": source_meta,
+            "content_metadata": content_meta,
+            "audio_metadata": None,
+            "text_metadata": None,
+            "image_metadata": image_metadata,
+            "table_metadata": None,
+            "chart_metadata": None,
+            "error_metadata": None,
+            "info_message_metadata": None,
+            "debug_metadata": None,
+            "raise_on_failure": False,
+        }
+
+        # Build the row dictionary.
+        row = {
+            "source_name": sname,
+            "source_id": sid,
+            "content": encoded_content,
+            "document_type": d_type,
+            "metadata": metadata,
+        }
+        rows.append(row)
+
+    # Create and return the DataFrame.
+    return pd.DataFrame(rows)
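
Usage note (not part of the diff): the utility helpers above can assemble a ledger DataFrame directly from local files. The sketch below is a hypothetical example; the file name sample.png is a placeholder, and it assumes the file exists on disk.

    from nv_ingest_api.interface.utility import (
        build_dataframe_from_files,
        get_document_type_from_extension,
    )

    path = "sample.png"  # placeholder file path
    doc_type = get_document_type_from_extension(path)  # maps ".png" to DocumentTypeEnum.PNG

    # One row per input; content is stored base64-encoded, and image rows receive
    # an empty image_metadata dict that later pipeline stages can populate.
    df = build_dataframe_from_files(
        file_paths=[path],
        source_names=[path],
        source_ids=[path],
        document_types=[doc_type],
    )
    print(df[["source_name", "document_type"]])
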