nv-ingest-api 2025.4.16.dev20250416__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +72 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.16.dev20250416.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
nv_ingest_api/interface/transform.py
@@ -0,0 +1,382 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from io import BytesIO
+from typing import Any, Optional, Dict, List, Union
+
+import pandas as pd
+
+from nv_ingest_api.interface.utility import (
+    build_dataframe_from_files,
+)
+from nv_ingest_api.internal.enums.common import DocumentTypeEnum
+from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
+from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
+from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+from nv_ingest_api.internal.transform.caption_image import transform_image_create_vlm_caption_internal
+from nv_ingest_api.internal.transform.embed_text import transform_create_text_embeddings_internal
+from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+
+@unified_exception_handler
+def transform_text_create_embeddings(
+    *,
+    inputs: pd.DataFrame,
+    api_key: str,
+    batch_size: Optional[int] = 8192,
+    embedding_model: Optional[str] = None,
+    embedding_nim_endpoint: Optional[str] = None,
+    encoding_format: Optional[str] = None,
+    input_type: Optional[str] = None,
+    truncate: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Creates text embeddings using the provided configuration.
+    Parameters provided as None fall back to the default values from TextEmbeddingSchema.
+    """
+    task_config = {}
+
+    # Build configuration parameters only if provided; defaults come from TextEmbeddingSchema.
+    config_kwargs = {
+        "batch_size": batch_size,
+        "embedding_model": embedding_model,
+        "embedding_nim_endpoint": embedding_nim_endpoint,
+        "encoding_format": encoding_format,
+        "input_type": input_type,
+        "truncate": truncate,
+    }
+    # Remove any keys with a None value.
+    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
+    config_kwargs["api_key"] = api_key
+
+    transform_config = TextEmbeddingSchema(**config_kwargs)
+
+    result, _ = transform_create_text_embeddings_internal(
+        df_transform_ledger=inputs,
+        task_config=task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )
+
+    return result
+
+
+@unified_exception_handler
+def transform_image_create_vlm_caption(
+    *,
+    inputs: Union[pd.DataFrame, tuple, List[tuple]],
+    api_key: Optional[str] = None,
+    prompt: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    model_name: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Extract captions for image content using the VLM model API.
+
+    This function processes image content for caption generation. It accepts input in one
+    of three forms:
+
+    1. A pandas DataFrame with the following required structure:
+       - Columns:
+         - ``source_name`` (str): Identifier for the source file.
+         - ``source_id`` (str): Unique identifier for the file.
+         - ``content`` (str): Base64-encoded string representing the file content.
+         - ``document_type`` (str): A string representing the document type (e.g., DocumentTypeEnum.PNG).
+         - ``metadata`` (dict): A dictionary containing at least:
+           - ``content``: Same as the base64-encoded file content.
+           - ``source_metadata``: Dictionary created via :func:`create_source_metadata`.
+           - ``content_metadata``: Dictionary created via :func:`create_content_metadata`.
+           - ``image_metadata``: For image files, initialized as an empty dict ({}); other metadata fields
+             (audio_metadata, text_metadata, etc.) are typically None or empty.
+           - ``raise_on_failure``: Boolean flag (typically False).
+
+    2. A single tuple of the form ``(file_source, document_type)``.
+       - ``file_source``: Either a file path (str) or a file-like object (e.g., BytesIO).
+       - ``document_type``: A string representing the document type (e.g., DocumentTypeEnum.PNG).
+
+    3. A list of such tuples.
+
+    For non-DataFrame inputs, a DataFrame is constructed using the helper function
+    :func:`build_dataframe_from_files`. When the file_source is a file-like object, its content
+    is converted to a base64-encoded string using :func:`read_bytesio_as_base64`; if it is a file
+    path (str), :func:`read_file_as_base64` is used.
+
+    Parameters
+    ----------
+    inputs : Union[pd.DataFrame, tuple, List[tuple]]
+        Input data representing image content. Accepted formats:
+          - A pandas DataFrame with the required structure as described above.
+          - A single tuple ``(file_source, document_type)``.
+          - A list of tuples of the form ``(file_source, document_type)``.
+        In the tuples, ``file_source`` is either a file path (str) or a file-like object (e.g., BytesIO),
+        and ``document_type`` is a string (typically one of the DocumentTypeEnum values).
+
+    api_key : Optional[str], default=None
+        API key for authentication with the VLM endpoint. If not provided, defaults are used.
+
+    prompt : Optional[str], default=None
+        Text prompt to guide caption generation.
+
+    endpoint_url : Optional[str], default=None
+        URL of the VLM model HTTP endpoint.
+
+    model_name : Optional[str], default=None
+        Name of the model to be used for caption generation.
+
+    Returns
+    -------
+    pd.DataFrame
+        A pandas DataFrame with generated captions inserted into the
+        ``metadata.image_metadata.caption`` field for each image row.
+
+    Raises
+    ------
+    ValueError
+        If the input is not a DataFrame, tuple, or list of tuples, or if any tuple is not of length 2.
+    Exception
+        Propagates any exception encountered during processing or caption extraction.
+
+    Examples
+    --------
+    >>> # Example using a DataFrame:
+    >>> df = pd.DataFrame({
+    ...     "source_name": ["image.png"],
+    ...     "source_id": ["image.png"],
+    ...     "content": ["<base64-string>"],
+    ...     "document_type": ["png"],
+    ...     "metadata": [{
+    ...         "content": "<base64-string>",
+    ...         "source_metadata": {...},
+    ...         "content_metadata": {...},
+    ...         "image_metadata": {},
+    ...         "raise_on_failure": False,
+    ...     }],
+    ... })
+    >>> transform_image_create_vlm_caption(inputs=df, api_key="key", prompt="Caption the image:")
+
+    >>> # Example using a tuple:
+    >>> transform_image_create_vlm_caption(inputs=("image.png", DocumentTypeEnum.PNG), api_key="key",
+    ...                                    prompt="Caption the image:")
+
+    >>> # Example using a list of tuples with file paths:
+    >>> transform_image_create_vlm_caption(inputs=[("image.png", DocumentTypeEnum.PNG),
+    ...                                            ("image2.png", DocumentTypeEnum.PNG)],
+    ...                                    api_key="key", prompt="Caption the image:")
+
+    >>> # Example using a list of tuples with BytesIO objects:
+    >>> from io import BytesIO
+    >>> with open("image.png", "rb") as f:
+    ...     bytes_io = BytesIO(f.read())
+    >>> transform_image_create_vlm_caption(inputs=[(bytes_io, DocumentTypeEnum.PNG)],
+    ...                                    api_key="key", prompt="Caption the image:")
+    """
+    if not isinstance(inputs, pd.DataFrame):
+        # Normalize a single tuple to a list.
+        if isinstance(inputs, tuple):
+            file_items = [inputs]
+        elif isinstance(inputs, list):
+            file_items = inputs
+        else:
+            raise ValueError(
+                "inputs must be a DataFrame, a tuple (file_source, document_type), or a list of such tuples."
+            )
+
+        file_sources: List[Union[str, BytesIO]] = []
+        source_names: List[str] = []
+        source_ids: List[str] = []
+        doc_types: List[str] = []
+
+        for item in file_items:
+            if not (isinstance(item, tuple) and len(item) == 2):
+                raise ValueError("Each item must be a tuple of (file_source, document_type).")
+            file_source, doc_type = item
+            file_sources.append(file_source)
+            # Use the file_source string as the identifier if available; else construct one.
+            if isinstance(file_source, str):
+                identifier = file_source
+            else:
+                identifier = f"bytesio_{doc_type}"
+            source_names.append(identifier)
+            source_ids.append(identifier)
+            doc_types.append(doc_type)
+
+        inputs = build_dataframe_from_files(file_sources, source_names, source_ids, doc_types)
+
+    task_config: Dict[str, Optional[str]] = {
+        "api_key": api_key,
+        "prompt": prompt,
+        "endpoint_url": endpoint_url,
+        "model_name": model_name,
+    }
+    filtered_task_config: Dict[str, str] = {k: v for k, v in task_config.items() if v is not None}
+
+    transform_config = ImageCaptionExtractionSchema(**filtered_task_config)
+
+    result = transform_image_create_vlm_caption_internal(
+        df_transform_ledger=inputs,
+        task_config=filtered_task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )
+
+    return result
+
+
+@unified_exception_handler
+def transform_text_split_and_tokenize(
+    *,
+    inputs: Union[pd.DataFrame, str, List[str]],
+    tokenizer: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    split_source_types: Optional[List[str]] = None,
+    hugging_face_access_token: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Transform and tokenize text documents by splitting them into smaller chunks.
+
+    This function prepares the configuration parameters for text splitting and tokenization,
+    and then delegates the splitting and asynchronous tokenization to an internal function.
+
+    The function accepts input in one of two forms:
+
+    1. A pandas DataFrame that already follows the required structure:
+
+       Required DataFrame Structure:
+         - source_name (str): Identifier for the source document.
+         - source_id (str): Unique identifier for the document.
+         - content (str): The document content (typically as a base64-encoded string).
+         - document_type (str): For plain text, set to DocumentTypeEnum.TXT.
+         - metadata (dict): Must contain:
+             * content: The original text content.
+             * content_metadata: A dictionary with a key "type" (e.g., "text").
+             * source_metadata: A dictionary with source-specific metadata (e.g., file path, timestamps).
+             * Other keys (audio_metadata, image_metadata, etc.) set to None or empty as appropriate.
+             * raise_on_failure: Boolean (typically False).
+
+    2. A plain text string or a list of plain text strings.
+       In this case, the function converts each text into a BytesIO object (encoding it as UTF-8)
+       and then uses the helper function `build_dataframe_from_files` to construct a DataFrame where:
+         - source_name and source_id are generated as "text_0", "text_1", etc.
+         - content is the base64-encoded representation of the UTF-8 encoded text.
+         - document_type is set to DocumentTypeEnum.TXT.
+         - metadata is constructed using helper functions (for source and content metadata),
+           with content_metadata's "type" set to "text".
+
+    Parameters
+    ----------
+    inputs : Union[pd.DataFrame, str, List[str]]
+        Either a DataFrame following the required structure, a single plain text string,
+        or a list of plain text strings.
+    tokenizer : str
+        Identifier or path of the tokenizer to be used (e.g., "bert-base-uncased").
+    chunk_size : int
+        Maximum number of tokens per chunk.
+    chunk_overlap : int
+        Number of tokens to overlap between consecutive chunks.
+    split_source_types : Optional[List[str]], default=None
+        List of source types to filter for text splitting. If None or empty, defaults to ["text"].
+    hugging_face_access_token : Optional[str], default=None
+        Access token for Hugging Face authentication, if required.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the processed documents, where text content has been split into smaller chunks.
+        The returned DataFrame retains the original columns and updates the "metadata" field with the
+        generated tokenized segments.
+
+    Raises
+    ------
+    Exception
+        Propagates any exceptions encountered during text splitting and tokenization, with additional
+        context provided by the unified exception handler.
+
+    Examples
+    --------
+    >>> # Using a DataFrame:
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     "source_name": ["doc1.txt"],
+    ...     "source_id": ["doc1.txt"],
+    ...     "content": ["<base64-encoded text>"],
+    ...     "document_type": ["text"],
+    ...     "metadata": [{
+    ...         "content": "This is a document.",
+    ...         "content_metadata": {"type": "text"},
+    ...         "source_metadata": {"source_id": "doc1.txt", "source_name": "doc1.txt", "source_type": "txt"},
+    ...         "audio_metadata": None,
+    ...         "image_metadata": None,
+    ...         "text_metadata": None,
+    ...         "raise_on_failure": False,
+    ...     }],
+    ... })
+    >>> transform_text_split_and_tokenize(
+    ...     inputs=df,
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+
+    >>> # Using a single plain text string:
+    >>> transform_text_split_and_tokenize(
+    ...     inputs="This is a plain text document.",
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+
+    >>> # Using a list of plain text strings:
+    >>> texts = ["Document one text.", "Document two text."]
+    >>> transform_text_split_and_tokenize(
+    ...     inputs=texts,
+    ...     tokenizer="bert-base-uncased",
+    ...     chunk_size=512,
+    ...     chunk_overlap=50
+    ... )
+    """
+    # If input is not a DataFrame, assume it is a string or list of strings and construct a DataFrame.
+    if not isinstance(inputs, pd.DataFrame):
+        if isinstance(inputs, str):
+            texts = [inputs]
+        elif isinstance(inputs, list) and all(isinstance(t, str) for t in inputs):
+            texts = inputs
+        else:
+            raise ValueError("inputs must be a DataFrame, a string, or a list of strings.")
+        # Convert each text string to a BytesIO object with UTF-8 encoding.
+        file_sources = [BytesIO(text.encode("utf-8")) for text in texts]
+        # Generate unique identifiers for source_name and source_id.
+        source_names = [f"text_{i}" for i in range(len(texts))]
+        source_ids = source_names.copy()
+        # For plain text, the document type is set to DocumentTypeEnum.TXT.
+        doc_types = [DocumentTypeEnum.TXT for _ in texts]
+        inputs = build_dataframe_from_files(file_sources, source_names, source_ids, doc_types)
+
+    if not split_source_types:
+        split_source_types = ["text"]
+
+    task_config: Dict[str, Any] = {
+        "chunk_overlap": chunk_overlap,
+        "chunk_size": chunk_size,
+        "params": {
+            "hf_access_token": hugging_face_access_token,
+            "split_source_types": split_source_types,
+        },
+        "tokenizer": tokenizer,
+    }
+
+    transform_config: TextSplitterSchema = TextSplitterSchema(
+        chunk_overlap=chunk_overlap,
+        chunk_size=chunk_size,
+        tokenizer=tokenizer,
+    )
+
+    result = transform_text_split_and_tokenize_internal(
+        df_transform_ledger=inputs,
+        task_config=task_config,
+        transform_config=transform_config,
+        execution_trace_log=None,
+    )

+    return result
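
The three entry points above all normalize non-DataFrame inputs through build_dataframe_from_files (see the utility.py hunk below) before delegating to the internal transforms. A minimal sketch of chaining the new split and embed calls on plain strings follows; the endpoint URL and API key are placeholder assumptions, and it assumes the split output ledger is directly consumable by the embedding transform:

from nv_ingest_api.interface.transform import (
    transform_text_create_embeddings,
    transform_text_split_and_tokenize,
)

# Plain strings are accepted; the interface builds the ledger DataFrame itself.
chunks = transform_text_split_and_tokenize(
    inputs=["First document text.", "Second document text."],
    tokenizer="bert-base-uncased",
    chunk_size=512,
    chunk_overlap=50,
)

# Unset parameters fall back to the TextEmbeddingSchema defaults.
embedded = transform_text_create_embeddings(
    inputs=chunks,
    api_key="<placeholder-api-key>",
    embedding_nim_endpoint="http://localhost:8000/v1",  # placeholder endpoint
)
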
nv_ingest_api/interface/utility.py
@@ -0,0 +1,200 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import os
+from io import BytesIO
+
+import pandas as pd
+from datetime import datetime
+from typing import List, Union
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum, DocumentTypeEnum
+
+# ------------------------------------------------------------------------------
+# Mapping from DocumentTypeEnum to ContentTypeEnum
+# ------------------------------------------------------------------------------
+DOCUMENT_TO_CONTENT_MAPPING = {
+    DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.HTML: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.MD: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
+    DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
+    DocumentTypeEnum.UNKNOWN: ContentTypeEnum.UNKNOWN,
+}
+
+
+# ------------------------------------------------------------------------------
+# Helper function to get the document type from a file extension.
+# ------------------------------------------------------------------------------
+def get_document_type_from_extension(file_path: str) -> str:
+    ext = os.path.splitext(file_path)[1].lower()
+    mapping = {
+        ".png": DocumentTypeEnum.PNG,
+        ".jpg": DocumentTypeEnum.JPEG,
+        ".jpeg": DocumentTypeEnum.JPEG,
+        ".tiff": DocumentTypeEnum.TIFF,
+        ".svg": DocumentTypeEnum.SVG,
+    }
+    return mapping.get(ext, DocumentTypeEnum.UNKNOWN)
+
+
+# ------------------------------------------------------------------------------
+# Helper function to read a file and return its base64-encoded string.
+# ------------------------------------------------------------------------------
+def read_file_as_base64(file_path: str) -> str:
+    """
+    Reads the file at file_path in binary mode and returns its base64-encoded string.
+    """
+    with open(file_path, "rb") as f:
+        file_bytes = f.read()
+    return base64.b64encode(file_bytes).decode("utf-8")
+
+
+# ------------------------------------------------------------------------------
+# Helper function to read a BytesIO object and return its base64-encoded string.
+# ------------------------------------------------------------------------------
+def read_bytesio_as_base64(file_io: BytesIO) -> str:
+    """
+    Reads a BytesIO object and returns its base64-encoded string.
+
+    Parameters:
+        file_io (BytesIO): A file-like object containing binary data.
+
+    Returns:
+        str: The base64-encoded string representation of the file's contents.
+    """
+    file_bytes = file_io.getvalue()
+    return base64.b64encode(file_bytes).decode("utf-8")
+
+
+# ------------------------------------------------------------------------------
+# Helper function to create source metadata.
+# ------------------------------------------------------------------------------
+def create_source_metadata(source_name: str, source_id: str, document_type: str) -> dict:
+    """
+    Creates a source metadata dictionary for a file.
+
+    The source_type is set to the provided document_type.
+    The date_created and last_modified fields are set to the current ISO timestamp.
+    """
+    now_iso = datetime.now().isoformat()
+    return {
+        "source_name": source_name,
+        "source_id": source_id,
+        "source_location": "",
+        "source_type": document_type,  # e.g., "pdf", "png", etc.
+        "collection_id": "",
+        "date_created": now_iso,
+        "last_modified": now_iso,
+        "summary": "",
+        "partition_id": -1,
+        "access_level": "unknown",  # Adjust if access-level tracking is needed.
+    }
+
+
+# ------------------------------------------------------------------------------
+# Helper function to create content metadata.
+# ------------------------------------------------------------------------------
+def create_content_metadata(document_type: str) -> dict:
+    """
+    Creates a content metadata dictionary for a file based on its document type.
+
+    It maps the document type to the corresponding content type.
+    """
+    # Use the mapping; if document_type is not found, fall back to "unknown".
+    content_type = DOCUMENT_TO_CONTENT_MAPPING.get(document_type, ContentTypeEnum.UNKNOWN)
+    return {
+        "type": content_type,
+        "description": "",
+        "page_number": -1,
+        "hierarchy": {
+            "page_count": -1,
+            "page": -1,
+            "block": -1,
+            "line": -1,
+            "span": -1,
+            "nearby_objects": {
+                "text": {"content": [], "bbox": [], "type": []},
+                "images": {"content": [], "bbox": [], "type": []},
+                "structured": {"content": [], "bbox": [], "type": []},
+            },
+        },
+        "subtype": "",
+    }
+
+
+# ------------------------------------------------------------------------------
+# Main helper function to build a DataFrame from lists of files.
+# ------------------------------------------------------------------------------
+def build_dataframe_from_files(
+    file_paths: List[Union[str, BytesIO]],
+    source_names: List[str],
+    source_ids: List[str],
+    document_types: List[str],
+) -> pd.DataFrame:
+    """
+    Given lists of file paths (or BytesIO objects), source names, source IDs, and document types,
+    reads each file (base64-encoding its contents) and constructs a DataFrame.
+
+    For image content, 'image_metadata' is initialized as an empty dict, so it can later be updated.
+    """
+    rows = []
+    # Validate that all lists have the same length.
+    if not (len(file_paths) == len(source_names) == len(source_ids) == len(document_types)):
+        raise ValueError("All input lists must have the same length.")
+
+    for fp, sname, sid, d_type in zip(file_paths, source_names, source_ids, document_types):
+        # Determine if fp is a file path (str) or a file-like object (e.g., BytesIO).
+        if isinstance(fp, str):
+            encoded_content = read_file_as_base64(fp)
+        elif hasattr(fp, "read"):
+            encoded_content = read_bytesio_as_base64(fp)
+        else:
+            raise ValueError("Each element in file_paths must be a string or a file-like object.")
+
+        # Build metadata components.
+        source_meta = create_source_metadata(sname, sid, d_type)
+        content_meta = create_content_metadata(d_type)
+        # If the content type is image, initialize image_metadata as {}.
+        image_metadata = {} if content_meta.get("type") == ContentTypeEnum.IMAGE else None
+
+        # Assemble the complete metadata dictionary.
+        metadata = {
+            "content": encoded_content,
+            "content_url": "",
+            "embedding": None,
+            "source_metadata": source_meta,
+            "content_metadata": content_meta,
+            "audio_metadata": None,
+            "text_metadata": None,
+            "image_metadata": image_metadata,
+            "table_metadata": None,
+            "chart_metadata": None,
+            "error_metadata": None,
+            "info_message_metadata": None,
+            "debug_metadata": None,
+            "raise_on_failure": False,
+        }
+
+        # Build the row dictionary.
+        row = {
+            "source_name": sname,
+            "source_id": sid,
+            "content": encoded_content,
+            "document_type": d_type,
+            "metadata": metadata,
+        }
+        rows.append(row)
+
+    # Create and return the DataFrame.
+    return pd.DataFrame(rows)
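
build_dataframe_from_files is the shared normalization step behind the interface functions in this release. A short self-contained sketch of calling it directly, using in-memory buffers so nothing needs to exist on disk (the byte payloads are dummies, not real image data):

from io import BytesIO

from nv_ingest_api.interface.utility import (
    build_dataframe_from_files,
    get_document_type_from_extension,
)
from nv_ingest_api.internal.enums.common import DocumentTypeEnum

# Extension mapping inspects only the path string; no file is opened.
assert get_document_type_from_extension("photo.jpg") == DocumentTypeEnum.JPEG

df = build_dataframe_from_files(
    file_paths=[BytesIO(b"dummy-image-bytes"), BytesIO(b"plain text")],
    source_names=["a.png", "b.txt"],
    source_ids=["a.png", "b.txt"],
    document_types=[DocumentTypeEnum.PNG, DocumentTypeEnum.TXT],
)

# Image rows get image_metadata == {}; non-image rows get None.
print(df.loc[0, "metadata"]["image_metadata"])  # {}
print(df.loc[1, "metadata"]["image_metadata"])  # None
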
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0