nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/nv_ingest_client/util/document_analysis.py
@@ -0,0 +1,314 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Utility functions for analyzing document-level chunk composition from nv-ingest results.
+
+This module provides analysis capabilities for understanding the distribution and types
+of extracted content elements across individual documents. It enables customers to
+gain visibility into their document composition for performance optimization and
+capacity planning decisions.
+"""
+
+import logging
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Union
+
+logger = logging.getLogger(__name__)
+
+
+def analyze_document_chunks(
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
+) -> Dict[str, Dict[str, Dict[str, int]]]:
+    """
+    Analyze ingestor results to count elements by type and page for each document.
+
+    This function processes results from nv-ingest ingestion and provides a per-document,
+    per-page breakdown of extracted content types, enabling customers to understand document
+    composition and page-level distribution for optimization and planning purposes.
+
+    Parameters
+    ----------
+    results : Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+        Ingestor results from ingestor.ingest() in standard List[List[Dict]] format,
+        or flattened List[Dict] format. Handles both regular lists and
+        LazyLoadedList objects automatically.
+
+    Returns
+    -------
+    Dict[str, Dict[str, Dict[str, int]]]
+        Dictionary mapping document names to page-level element type counts with structure:
+        {
+            "document1.pdf": {
+                "total": {
+                    "text": 7, "charts": 1, "tables": 1,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                },
+                "1": {
+                    "text": 3, "charts": 1, "tables": 0,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                },
+                "2": {
+                    "text": 4, "charts": 0, "tables": 1,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                }
+            },
+            "document2.pdf": {...}
+        }
+
+    Notes
+    -----
+    - Requires purge_results_after_upload=False in vdb_upload() configuration
+    - Automatically handles LazyLoadedList objects from nv-ingest client
+    - Returns zero counts for missing element types
+    - Assumes valid nv-ingest output format with guaranteed metadata structure
+
+    Examples
+    --------
+    >>> from nv_ingest_client.util.document_analysis import analyze_document_chunks
+    >>>
+    >>> # After running ingestion
+    >>> results, failures = ingestor.ingest(show_progress=True, return_failures=True)
+    >>>
+    >>> # Analyze document composition by page
+    >>> breakdown = analyze_document_chunks(results)
+    >>>
+    >>> for doc_name, pages in breakdown.items():
+    ...     total_counts = pages["total"]
+    ...     total_elements = sum(total_counts.values())
+    ...     page_count = len(pages) - 1  # Subtract 1 for "total" key
+    ...     print(f"{doc_name}: {total_elements} elements across {page_count} pages")
+    ...     print(f"  total: {total_elements} elements ({total_counts['text']} text, {total_counts['charts']} charts)")
+    ...     for page_name, counts in pages.items():
+    ...         if page_name != "total":  # Skip total when listing pages
+    ...             page_total = sum(counts.values())
+    ...             print(
+    ...                 f"  page {page_name}: {page_total} elements "
+    ...                 f"({counts['text']} text, {counts['charts']} charts)"
+    ...             )
+    """
+
+    if not results:
+        logger.warning("No results provided for analysis")
+        return {}
+
+    # Normalize input format to handle both List[List[Dict]] and List[Dict] structures
+    normalized_results = _normalize_results_format(results)
+
+    # Group elements by document name and page number
+    document_page_elements = defaultdict(lambda: defaultdict(list))
+
+    for doc_results in normalized_results:
+        # Handle LazyLoadedList and other iterable types
+        elements = _extract_elements_from_doc(doc_results)
+
+        for element in elements:
+            doc_name = _extract_document_name(element)
+            page_key = _extract_page_key(element)
+            document_page_elements[doc_name][page_key].append(element)
+
+    # Count element types per page within each document and calculate totals
+    document_page_counts = {}
+
+    for doc_name, pages in document_page_elements.items():
+        document_page_counts[doc_name] = {}
+        total_counts = _initialize_element_counts()
+
+        for page_key, elements in pages.items():
+            counts = _initialize_element_counts()
+
+            for element in elements:
+                element_type = _categorize_element(element)
+                counts[element_type] += 1
+                total_counts[element_type] += 1  # Add to document total
+
+            document_page_counts[doc_name][page_key] = counts
+
+        # Add the total counts for this document
+        document_page_counts[doc_name]["total"] = total_counts
+
+    if document_page_counts:
+        total_docs = len(document_page_counts)
+        total_pages = sum(len(pages) - 1 for pages in document_page_counts.values())  # Subtract 1 for "total" key
+        total_elements = sum(sum(page_counts["total"].values()) for page_counts in document_page_counts.values())
+        logger.info(f"Analyzed {total_elements} elements across {total_pages} pages in {total_docs} documents")
+    else:
+        logger.warning("No valid documents found for analysis")
+
+    return document_page_counts
+
+
+def _normalize_results_format(results: Union[List[List[Dict]], List[Dict]]) -> List[List[Dict]]:
+    """
+    Normalize various input formats to consistent List[List[Dict]] structure.
+
+    Parameters
+    ----------
+    results : Union[List[List[Dict]], List[Dict]]
+        Input results in various formats
+
+    Returns
+    -------
+    List[List[Dict]]
+        Normalized results in standard format
+    """
+
+    if not results:
+        return []
+
+    # Handle List[List[Dict]] or List[LazyLoadedList] formats
+    if isinstance(results, list) and len(results) > 0:
+        first_elem = results[0]
+        # Check for list, LazyLoadedList, or any sequence-like object
+        if isinstance(first_elem, list) or (
+            hasattr(first_elem, "__iter__") and hasattr(first_elem, "__len__") and not isinstance(first_elem, dict)
+        ):
+            return results
+
+    # Handle flattened List[Dict] format by grouping elements by document
+    if isinstance(results, list) and len(results) > 0 and isinstance(results[0], dict):
+        doc_groups = defaultdict(list)
+        for element in results:
+            doc_name = _extract_document_name(element)
+            doc_groups[doc_name].append(element)
+
+        return list(doc_groups.values())
+
+    # Fallback for unexpected formats
+    return [[item] for item in results if item]
+
+
+def _extract_elements_from_doc(doc_results) -> List[Dict]:
+    """
+    Extract elements from document results, handling various data types.
+
+    Parameters
+    ----------
+    doc_results : Any
+        Document results which may be a list, LazyLoadedList, or other iterable
+
+    Returns
+    -------
+    List[Dict]
+        List of element dictionaries
+    """
+
+    if isinstance(doc_results, list):
+        return doc_results
+    elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
+        # Handle LazyLoadedList and other sequence-like objects
+        return list(doc_results)
+    else:
+        # Single element case
+        return [doc_results] if doc_results else []
+
+
+def _extract_document_name(element: Dict[str, Any]) -> str:
+    """
+    Extract clean document name from element metadata.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary containing metadata
+
+    Returns
+    -------
+    str
+        Clean document filename (basename of source_id)
+    """
+
+    # nv-ingest guarantees this structure exists
+    source_id = element["metadata"]["source_metadata"]["source_id"]
+    return os.path.basename(source_id)
+
+
+def _extract_page_key(element: Dict[str, Any]) -> str:
+    """
+    Extract page key from element metadata for consistent page naming.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary containing metadata
+
+    Returns
+    -------
+    str
+        Page number as string (e.g., "1", "2", or "unknown")
+    """
+
+    try:
+        page_number = element["metadata"]["content_metadata"]["page_number"]
+        if page_number is not None and page_number >= 0:
+            return str(page_number)
+        else:
+            return "unknown"
+    except (KeyError, TypeError):
+        logger.warning("Missing or invalid page_number in element metadata")
+        return "unknown"
+
+
+def _categorize_element(element: Dict[str, Any]) -> str:
+    """
+    Categorize element by type using document_type and content metadata.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary with document_type and metadata fields
+
+    Returns
+    -------
+    str
+        Element category: "text", "charts", "tables", "unstructured_images",
+        "infographics", or "page_images"
+    """
+
+    doc_type = element["document_type"]
+
+    # Text elements
+    if doc_type == "text":
+        return "text"
+
+    # Structured elements with subtypes
+    elif doc_type == "structured":
+        subtype = element["metadata"]["content_metadata"]["subtype"]
+        if subtype == "chart":
+            return "charts"
+        elif subtype == "table":
+            return "tables"
+        elif subtype == "infographic":
+            return "infographics"
+        elif subtype == "page_image":
+            return "page_images"
+
+    # Image elements (unstructured)
+    elif doc_type == "image":
+        return "unstructured_images"
+
+    # Should not reach here with valid nv-ingest output
+    logger.warning(f"Unexpected element type: {doc_type}")
+    return "text"  # Default to text for safety
+
+
+def _initialize_element_counts() -> Dict[str, int]:
+    """
+    Initialize element counts dictionary with all supported types.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with zero counts for all element types
+    """
+
+    return {
+        "text": 0,
+        "charts": 0,
+        "tables": 0,
+        "unstructured_images": 0,
+        "infographics": 0,
+        "page_images": 0,
+    }
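
The analyzer works against the standard nv-ingest element layout: _extract_document_name reads metadata.source_metadata.source_id, _extract_page_key reads metadata.content_metadata.page_number, and _categorize_element branches on document_type plus content_metadata.subtype. A minimal sketch with two hand-built (synthetic) elements illustrates both that layout and the flattened-input path through _normalize_results_format:

    from nv_ingest_client.util.document_analysis import analyze_document_chunks

    # Two synthetic elements shaped like nv-ingest output; every key below is
    # one the helpers above actually read.
    elements = [
        {
            "document_type": "text",
            "metadata": {
                "source_metadata": {"source_id": "/data/report.pdf"},
                "content_metadata": {"page_number": 1},
            },
        },
        {
            "document_type": "structured",
            "metadata": {
                "source_metadata": {"source_id": "/data/report.pdf"},
                "content_metadata": {"page_number": 2, "subtype": "table"},
            },
        },
    ]

    # Flattened List[Dict] input is regrouped per document by
    # _normalize_results_format before counting.
    breakdown = analyze_document_chunks(elements)
    assert breakdown["report.pdf"]["total"] == {
        "text": 1, "charts": 0, "tables": 1,
        "unstructured_images": 0, "infographics": 0, "page_images": 0,
    }
    assert breakdown["report.pdf"]["2"]["tables"] == 1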
--- /dev/null
+++ b/nv_ingest_client/util/image_disk_utils.py
@@ -0,0 +1,300 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Utility functions for saving images from ingestion results to disk as actual image files.
+
+This module provides comprehensive utilities for extracting and saving base64-encoded
+images from nv-ingest results to the local filesystem. Features include:
+- Configurable filtering by image type (charts, tables, infographics, etc.)
+- Descriptive filename generation with source and page information
+- Organized directory structure by image type
+- Detailed image counting and statistics
+
+Typical use cases:
+- Debugging and visual inspection of extracted content
+- Quality assessment of the image extraction pipeline
+"""
+
+import logging
+import os
+from typing import Any, Dict, List
+
+from nv_ingest_client.client.util.processing import get_valid_filename
+from nv_ingest_api.util.image_processing.transforms import save_image_to_disk, _detect_base64_image_format
+
+logger = logging.getLogger(__name__)
+
+
+def _detect_extension_from_content(image_content: str) -> str:
+    """
+    Get file extension by detecting original image format.
+    Falls back to "jpg" if detection fails or the format is unknown.
+    """
+    DEFAULT_EXT = "jpg"  # must be either "jpg" or "png"
+    try:
+        fmt = _detect_base64_image_format(image_content).upper()
+    except Exception:
+        logger.warning("Image format detection failed; falling back to default '%s'.", DEFAULT_EXT)
+        return DEFAULT_EXT
+    ext_map = {
+        "JPEG": "jpg",
+        "JPG": "jpg",
+        "PNG": "png",
+    }
+    ext = ext_map.get(fmt, None)
+    if ext:
+        return ext
+    logger.warning("Unsupported image format '%s'; falling back to default '%s'.", fmt, DEFAULT_EXT)
+    return DEFAULT_EXT
+
+
+def save_images_to_disk(
+    response_data: List[Dict[str, Any]],
+    output_directory: str,
+    save_charts: bool = True,
+    save_tables: bool = True,
+    save_infographics: bool = True,
+    save_page_images: bool = False,
+    save_raw_images: bool = False,
+    organize_by_type: bool = True,
+    output_format: str = "auto",
+) -> Dict[str, int]:
+    """
+    Save base64-encoded images from ingestion results to disk as actual image files.
+
+    This utility extracts images from ingestion response data and saves them to disk
+    with descriptive filenames that include the image subtype and page information.
+    It provides granular control over which types of images to save.
+
+    Parameters
+    ----------
+    response_data : List[Dict[str, Any]]
+        List of document results from ingestion, each containing metadata with base64 images.
+    output_directory : str
+        Base directory where images will be saved.
+    save_charts : bool, optional
+        Whether to save chart images. Default is True.
+    save_tables : bool, optional
+        Whether to save table images. Default is True.
+    save_infographics : bool, optional
+        Whether to save infographic images. Default is True.
+    save_page_images : bool, optional
+        Whether to save page-as-image files. Default is False.
+    save_raw_images : bool, optional
+        Whether to save raw/natural images. Default is False.
+    organize_by_type : bool, optional
+        Whether to organize images into subdirectories by type. Default is True.
+    output_format : str, optional
+        Output image format for saved files. Default is "auto".
+        - "auto": Preserve original format (fastest, no conversion)
+        - "jpeg": Convert to JPEG (smaller files, good compression)
+        - "png": Convert to PNG (lossless quality)
+        Use "auto" for maximum speed by avoiding format conversion.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+
+    Raises
+    ------
+    ValueError
+        If output_format is not supported.
+
+    Examples
+    --------
+    >>> from nv_ingest_client.util.image_disk_utils import save_images_to_disk
+    >>>
+    >>> # Save only charts and tables
+    >>> counts = save_images_to_disk(
+    ...     response_data,
+    ...     "./output/images",
+    ...     save_charts=True,
+    ...     save_tables=True,
+    ...     save_page_images=False
+    ... )
+    >>> print(f"Saved {counts['chart']} charts and {counts['table']} tables")
+    """
+
+    if not response_data:
+        logger.warning("No response data provided")
+        return {}
+
+    # Validate format upfront to fail fast
+    normalized_format = output_format.lower()
+    if normalized_format not in ["auto", "png", "jpeg", "jpg"]:
+        raise ValueError(
+            f"Unsupported output format: '{output_format}'. Supported formats: 'auto', 'png', 'jpeg', 'jpg'"
+        )
+
+    # Initialize counters
+    image_counts = {"chart": 0, "table": 0, "infographic": 0, "page_image": 0, "image": 0, "total": 0}
+
+    # Create output directory
+    os.makedirs(output_directory, exist_ok=True)
+
+    for doc_idx, document in enumerate(response_data):
+        try:
+            metadata = document.get("metadata", {})
+            doc_type = document.get("document_type", "unknown")
+
+            # Skip documents without image content
+            image_content = metadata.get("content")
+            if not image_content:
+                continue
+
+            # Get document info for naming
+            source_metadata = metadata.get("source_metadata", {})
+            source_id = source_metadata.get("source_id", f"document_{doc_idx}")
+            clean_source_name = get_valid_filename(os.path.basename(source_id))
+
+            content_metadata = metadata.get("content_metadata", {})
+            subtype = content_metadata.get("subtype", "image")
+            page_number = content_metadata.get("page_number", 0)
+
+            # Apply filtering based on image subtype and user preferences
+            should_save = False
+            if subtype == "chart" and save_charts:
+                should_save = True
+            elif subtype == "table" and save_tables:
+                should_save = True
+            elif subtype == "infographic" and save_infographics:
+                should_save = True
+            elif subtype == "page_image" and save_page_images:
+                should_save = True
+            elif (
+                doc_type == "image"
+                and subtype not in ["chart", "table", "infographic", "page_image"]
+                and save_raw_images
+            ):
+                should_save = True
+                subtype = "image"  # Normalize subtype for consistent counting
+
+            if not should_save:
+                continue
+
+            # Determine file extension and target format (format already validated upfront)
+            if normalized_format in ["jpeg", "jpg"]:
+                file_ext, target_format = "jpeg", "jpeg"
+            elif normalized_format == "png":
+                file_ext, target_format = "png", "png"
+            else:  # normalized_format == "auto" - detect once and use result
+                detected_ext = _detect_extension_from_content(image_content)
+                if detected_ext == "png":
+                    file_ext, target_format = "png", "png"
+                else:  # detected_ext == "jpg"
+                    file_ext, target_format = "jpeg", "jpeg"
+
+            if organize_by_type:
+                # Organize into subdirectories by image type
+                type_dir = os.path.join(output_directory, subtype)
+                os.makedirs(type_dir, exist_ok=True)
+                image_filename = f"{clean_source_name}_p{page_number}_{doc_idx}.{file_ext}"
+                image_path = os.path.join(type_dir, image_filename)
+            else:
+                # Flat directory structure with type in filename
+                image_filename = f"{clean_source_name}_{subtype}_p{page_number}_{doc_idx}.{file_ext}"
+                image_path = os.path.join(output_directory, image_filename)
+
+            # Save image using centralized API function
+            try:
+                success = save_image_to_disk(image_content, image_path, target_format)
+
+                if success:
+                    # Update image type counters
+                    image_counts[subtype] += 1
+                    image_counts["total"] += 1
+                    logger.debug(f"Saved {subtype} image: {image_path}")
+                else:
+                    logger.error(f"Failed to save {subtype} image for {clean_source_name}")
+
+            except Exception as e:
+                logger.error(f"Failed to save {subtype} image for {clean_source_name}: {e}")
+
+        except Exception as e:
+            logger.error(f"Failed to process document {doc_idx}: {e}")
+            continue
+
+    # Log summary statistics
+    if image_counts["total"] > 0:
+        logger.info(f"Successfully saved {image_counts['total']} images to {output_directory}")
+        for img_type, count in image_counts.items():
+            if img_type != "total" and count > 0:
+                logger.info(f"  - {img_type}: {count}")
+    else:
+        logger.info("No images were saved (none met filter criteria)")
+
+    return image_counts
+
+
+def save_images_from_response(response: Dict[str, Any], output_directory: str, **kwargs) -> Dict[str, int]:
+    """
+    Convenience function to save images from a full API response.
+
+    Parameters
+    ----------
+    response : Dict[str, Any]
+        Full API response containing a "data" field with document results.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Additional arguments passed to save_images_to_disk().
+        Includes output_format ("auto", "png", or "jpeg") and other filtering options.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+    """
+
+    if "data" not in response or not response["data"]:
+        logger.warning("No data found in response")
+        return {}
+
+    return save_images_to_disk(response["data"], output_directory, **kwargs)
+
+
+def save_images_from_ingestor_results(
+    results: List[List[Dict[str, Any]]], output_directory: str, **kwargs
+) -> Dict[str, int]:
+    """
+    Save images from Ingestor.ingest() results.
+
+    Parameters
+    ----------
+    results : List[List[Dict[str, Any]]]
+        Results from Ingestor.ingest(), where each inner list contains
+        document results for one source file. Can also handle LazyLoadedList
+        objects when save_to_disk=True is used.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Additional arguments passed to save_images_to_disk().
+        Includes output_format ("auto", "png", or "jpeg") and other filtering options.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+    """
+
+    # Flatten results from multiple documents into a single list
+    all_documents = []
+    for doc_results in results:
+        if isinstance(doc_results, list):
+            # Standard list of document results
+            all_documents.extend(doc_results)
+        elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
+            # Handle LazyLoadedList or other sequence-like objects
+            try:
+                all_documents.extend(list(doc_results))
+            except Exception as e:
+                logger.warning(f"Failed to process document results: {e}")
+                continue
+        else:
+            # Handle single document case
+            all_documents.append(doc_results)
+
+    return save_images_to_disk(all_documents, output_directory, **kwargs)
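
A minimal usage sketch for the new module, assuming `results` holds the output of Ingestor.ingest(); every keyword argument below is defined in the diff above. Charts, tables, and infographics are saved by default, while page renders and raw images are opt-in:

    from nv_ingest_client.util.image_disk_utils import save_images_from_ingestor_results

    # results = ingestor.ingest(...)  # assumed to be available
    counts = save_images_from_ingestor_results(
        results,
        "./output/images",
        save_page_images=True,   # opt in to full-page renders
        save_raw_images=False,   # skip natural/raw images
        organize_by_type=True,   # one subdirectory per subtype
        output_format="auto",    # keep the original encoding, no conversion
    )
    print(f"Saved {counts.get('total', 0)} images")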
--- a/nv_ingest_client/util/transport.py
+++ b/nv_ingest_client/util/transport.py
@@ -15,35 +15,41 @@ def infer_microservice(
     truncate: str = "END",
     batch_size: int = 8191,
     grpc: bool = False,
+    input_names: list = ["text"],
+    output_names: list = ["embeddings"],
+    dtypes: list = ["BYTES"],
 ):
     """
     This function takes the input data and creates a list of embeddings
     using the NVIDIA embedding microservice.
     """
-    data = {"prompts": [res["metadata"]["content"] for res in data]}
+    if isinstance(data[0], str):
+        data = {"prompts": data}
+    else:
+        data = {"prompts": [res["metadata"]["content"] for res in data]}
     if grpc:
         model_name = re.sub(r"[^a-zA-Z0-9]", "_", model_name)
         client = NimClient(
             model_interface=EmbeddingModelInterface(),
             protocol="grpc",
-            endpoints=(embedding_endpoint, embedding_endpoint),
+            endpoints=(embedding_endpoint, None),
             auth_token=nvidia_api_key,
         )
         return client.infer(
             data,
             model_name,
             parameters={"input_type": input_type, "truncate": truncate},
-            outputs=["embeddings"],
-            dtype=["BYTES"],
-            input_name=["text"],
+            dtypes=dtypes,
+            input_names=input_names,
             batch_size=batch_size,
+            output_names=output_names,
         )
     else:
         embedding_endpoint = f"{embedding_endpoint}/embeddings"
         client = NimClient(
             model_interface=EmbeddingModelInterface(),
             protocol="http",
-            endpoints=(embedding_endpoint, embedding_endpoint),
+            endpoints=(None, embedding_endpoint),
             auth_token=nvidia_api_key,
         )
         return client.infer(data, model_name, input_type=input_type, truncate=truncate, batch_size=batch_size)
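
A hedged call sketch for the updated function. Only the tail of the signature appears in this hunk, so the leading keyword names (data, model_name, embedding_endpoint, nvidia_api_key, input_type) are inferred from the body above, and the model id, endpoint, and API key are placeholders:

    from nv_ingest_client.util.transport import infer_microservice

    # Plain strings are now accepted directly via the new isinstance(data[0], str)
    # branch; previously each item had to be an element dict carrying
    # metadata["content"].
    embeddings = infer_microservice(
        data=["first passage", "second passage"],
        model_name="nvidia/nv-embedqa-e5-v5",           # placeholder model id
        embedding_endpoint="http://localhost:8000/v1",  # placeholder endpoint
        nvidia_api_key="nvapi-XXXX",                    # placeholder key
        input_type="passage",
        grpc=False,
        # New in this release; only consumed on the gRPC path:
        input_names=["text"],
        output_names=["embeddings"],
        dtypes=["BYTES"],
    )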