nv-ingest-client 2025.9.8.dev20250908.tar.gz → 2025.9.10.dev20250910.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (61)
  1. {nv_ingest_client-2025.9.8.dev20250908/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.10.dev20250910}/PKG-INFO +1 -1
  2. nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client/util/document_analysis.py +314 -0
  3. nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client/util/image_disk_utils.py +300 -0
  4. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/util.py +1 -0
  5. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/milvus.py +58 -38
  6. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  7. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
  8. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/LICENSE +0 -0
  9. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/MANIFEST.in +0 -0
  10. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/README.md +0 -0
  11. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/pyproject.toml +0 -0
  12. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/setup.cfg +0 -0
  13. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/__init__.py +0 -0
  14. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/__init__.py +0 -0
  15. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  16. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/click.py +0 -0
  17. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/processing.py +0 -0
  18. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/system.py +0 -0
  19. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/__init__.py +0 -0
  20. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/client.py +0 -0
  21. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/interface.py +0 -0
  22. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/util/processing.py +0 -0
  23. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  24. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/__init__.py +0 -0
  25. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  26. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
  27. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  28. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  29. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  30. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  31. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  32. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  33. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  34. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  35. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  36. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  37. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  38. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  39. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  40. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  41. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  42. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  43. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  44. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/__init__.py +0 -0
  45. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/dataset.py +0 -0
  46. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  47. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  48. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/milvus.py +0 -0
  49. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/process_json_files.py +0 -0
  50. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/processing.py +0 -0
  51. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/system.py +0 -0
  52. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/transport.py +0 -0
  53. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  54. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  55. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  56. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/zipkin.py +0 -0
  57. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  58. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  59. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  60. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  61. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/version.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.9.8.dev20250908
+Version: 2025.9.10.dev20250910
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
src/nv_ingest_client/util/document_analysis.py (new file)
@@ -0,0 +1,314 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Utility functions for analyzing document-level chunk composition from nv-ingest results.
+
+This module provides analysis capabilities for understanding the distribution and types
+of extracted content elements across individual documents. It enables customers to
+gain visibility into their document composition for performance optimization and
+capacity planning decisions.
+"""
+
+import logging
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Union
+
+logger = logging.getLogger(__name__)
+
+
+def analyze_document_chunks(
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+) -> Dict[str, Dict[str, Dict[str, int]]]:
+    """
+    Analyze ingestor results to count elements by type and page for each document.
+
+    This function processes results from nv-ingest ingestion and provides a per-document,
+    per-page breakdown of extracted content types, enabling customers to understand document
+    composition and page-level distribution for optimization and planning purposes.
+
+    Parameters
+    ----------
+    results : Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+        Ingestor results from ingestor.ingest() in standard List[List[Dict]] format,
+        or flattened List[Dict] format. Handles both regular lists and
+        LazyLoadedList objects automatically.
+
+    Returns
+    -------
+    Dict[str, Dict[str, Dict[str, int]]]
+        Dictionary mapping document names to page-level element type counts with structure:
+        {
+            "document1.pdf": {
+                "total": {
+                    "text": 7, "charts": 1, "tables": 1,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                },
+                "1": {
+                    "text": 3, "charts": 1, "tables": 0,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                },
+                "2": {
+                    "text": 4, "charts": 0, "tables": 1,
+                    "unstructured_images": 0, "infographics": 0, "page_images": 0
+                }
+            },
+            "document2.pdf": {...}
+        }
+
+    Notes
+    -----
+    - Requires purge_results_after_upload=False in vdb_upload() configuration
+    - Automatically handles LazyLoadedList objects from nv-ingest client
+    - Returns zero counts for missing element types
+    - Assumes valid nv-ingest output format with guaranteed metadata structure
+
+    Examples
+    --------
+    >>> from nv_ingest_client.util.document_analysis import analyze_document_chunks
+    >>>
+    >>> # After running ingestion
+    >>> results, failures = ingestor.ingest(show_progress=True, return_failures=True)
+    >>>
+    >>> # Analyze document composition by page
+    >>> breakdown = analyze_document_chunks(results)
+    >>>
+    >>> for doc_name, pages in breakdown.items():
+    ...     total_counts = pages["total"]
+    ...     total_elements = sum(total_counts.values())
+    ...     page_count = len(pages) - 1  # Subtract 1 for "total" key
+    ...     print(f"{doc_name}: {total_elements} elements across {page_count} pages")
+    ...     print(f"  total: {total_elements} elements ({total_counts['text']} text, {total_counts['charts']} charts)")
+    ...     for page_name, counts in pages.items():
+    ...         if page_name != "total":  # Skip total when listing pages
+    ...             page_total = sum(counts.values())
+    ...             print(
+    ...                 f"  page {page_name}: {page_total} elements "
+    ...                 f"({counts['text']} text, {counts['charts']} charts)"
+    ...             )
+    """
+
+    if not results:
+        logger.warning("No results provided for analysis")
+        return {}
+
+    # Normalize input format to handle both List[List[Dict]] and List[Dict] structures
+    normalized_results = _normalize_results_format(results)
+
+    # Group elements by document name and page number
+    document_page_elements = defaultdict(lambda: defaultdict(list))
+
+    for doc_results in normalized_results:
+        # Handle LazyLoadedList and other iterable types
+        elements = _extract_elements_from_doc(doc_results)
+
+        for element in elements:
+            doc_name = _extract_document_name(element)
+            page_key = _extract_page_key(element)
+            document_page_elements[doc_name][page_key].append(element)
+
+    # Count element types per page within each document and calculate totals
+    document_page_counts = {}
+
+    for doc_name, pages in document_page_elements.items():
+        document_page_counts[doc_name] = {}
+        total_counts = _initialize_element_counts()
+
+        for page_key, elements in pages.items():
+            counts = _initialize_element_counts()
+
+            for element in elements:
+                element_type = _categorize_element(element)
+                counts[element_type] += 1
+                total_counts[element_type] += 1  # Add to document total
+
+            document_page_counts[doc_name][page_key] = counts
+
+        # Add the total counts for this document
+        document_page_counts[doc_name]["total"] = total_counts
+
+    if document_page_counts:
+        total_docs = len(document_page_counts)
+        total_pages = sum(len(pages) - 1 for pages in document_page_counts.values())  # Subtract 1 for "total" key
+        total_elements = sum(sum(page_counts["total"].values()) for page_counts in document_page_counts.values())
+        logger.info(f"Analyzed {total_elements} elements across {total_pages} pages in {total_docs} documents")
+    else:
+        logger.warning("No valid documents found for analysis")
+
+    return document_page_counts
+
+
+def _normalize_results_format(results: Union[List[List[Dict]], List[Dict]]) -> List[List[Dict]]:
+    """
+    Normalize various input formats to consistent List[List[Dict]] structure.
+
+    Parameters
+    ----------
+    results : Union[List[List[Dict]], List[Dict]]
+        Input results in various formats
+
+    Returns
+    -------
+    List[List[Dict]]
+        Normalized results in standard format
+    """
+
+    if not results:
+        return []
+
+    # Handle List[List[Dict]] or List[LazyLoadedList] formats
+    if isinstance(results, list) and len(results) > 0:
+        first_elem = results[0]
+        # Check for list, LazyLoadedList, or any sequence-like object
+        if isinstance(first_elem, list) or (
+            hasattr(first_elem, "__iter__") and hasattr(first_elem, "__len__") and not isinstance(first_elem, dict)
+        ):
+            return results
+
+    # Handle flattened List[Dict] format by grouping elements by document
+    if isinstance(results, list) and len(results) > 0 and isinstance(results[0], dict):
+        doc_groups = defaultdict(list)
+        for element in results:
+            doc_name = _extract_document_name(element)
+            doc_groups[doc_name].append(element)
+
+        return list(doc_groups.values())
+
+    # Fallback for unexpected formats
+    return [[item] for item in results if item]
+
+
+def _extract_elements_from_doc(doc_results) -> List[Dict]:
+    """
+    Extract elements from document results, handling various data types.
+
+    Parameters
+    ----------
+    doc_results : Any
+        Document results which may be a list, LazyLoadedList, or other iterable
+
+    Returns
+    -------
+    List[Dict]
+        List of element dictionaries
+    """
+
+    if isinstance(doc_results, list):
+        return doc_results
+    elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
+        # Handle LazyLoadedList and other sequence-like objects
+        return list(doc_results)
+    else:
+        # Single element case
+        return [doc_results] if doc_results else []
+
+
+def _extract_document_name(element: Dict[str, Any]) -> str:
+    """
+    Extract clean document name from element metadata.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary containing metadata
+
+    Returns
+    -------
+    str
+        Clean document filename (basename of source_id)
+    """
+
+    # nv-ingest guarantees this structure exists
+    source_id = element["metadata"]["source_metadata"]["source_id"]
+    return os.path.basename(source_id)
+
+
+def _extract_page_key(element: Dict[str, Any]) -> str:
+    """
+    Extract page key from element metadata for consistent page naming.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary containing metadata
+
+    Returns
+    -------
+    str
+        Page number as string (e.g., "1", "2", or "unknown")
+    """
+
+    try:
+        page_number = element["metadata"]["content_metadata"]["page_number"]
+        if page_number is not None and page_number >= 0:
+            return str(page_number)
+        else:
+            return "unknown"
+    except (KeyError, TypeError):
+        logger.warning("Missing or invalid page_number in element metadata")
+        return "unknown"
+
+
+def _categorize_element(element: Dict[str, Any]) -> str:
+    """
+    Categorize element by type using document_type and content metadata.
+
+    Parameters
+    ----------
+    element : Dict[str, Any]
+        Element dictionary with document_type and metadata fields
+
+    Returns
+    -------
+    str
+        Element category: "text", "charts", "tables", "unstructured_images",
+        "infographics", or "page_images"
+    """
+
+    doc_type = element["document_type"]
+
+    # Text elements
+    if doc_type == "text":
+        return "text"
+
+    # Structured elements with subtypes
+    elif doc_type == "structured":
+        subtype = element["metadata"]["content_metadata"]["subtype"]
+        if subtype == "chart":
+            return "charts"
+        elif subtype == "table":
+            return "tables"
+        elif subtype == "infographic":
+            return "infographics"
+        elif subtype == "page_image":
+            return "page_images"
+
+    # Image elements (unstructured)
+    elif doc_type == "image":
+        return "unstructured_images"
+
+    # Should not reach here with valid nv-ingest output
+    logger.warning(f"Unexpected element type: {doc_type}")
+    return "text"  # Default to text for safety
+
+
+def _initialize_element_counts() -> Dict[str, int]:
+    """
+    Initialize element counts dictionary with all supported types.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with zero counts for all element types
+    """
+
+    return {
+        "text": 0,
+        "charts": 0,
+        "tables": 0,
+        "unstructured_images": 0,
+        "infographics": 0,
+        "page_images": 0,
+    }
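
The new analyzer is purely read-side: it walks whatever results the client kept in memory and tallies element types per page. A minimal usage sketch (hedged: "results" is assumed to come from a prior ingestor.ingest() call with purge_results_after_upload=False, as the docstring above notes):

    from nv_ingest_client.util.document_analysis import analyze_document_chunks

    # breakdown: document name -> page key ("total", "1", "2", ...) -> type counts
    breakdown = analyze_document_chunks(results)

    # Roll the per-document "total" rows up into one corpus-wide summary
    corpus_totals = {}
    for pages in breakdown.values():
        for element_type, count in pages["total"].items():
            corpus_totals[element_type] = corpus_totals.get(element_type, 0) + count
    print(corpus_totals)  # e.g. {"text": 11, "charts": 1, "tables": 1, ...}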
src/nv_ingest_client/util/image_disk_utils.py (new file)
@@ -0,0 +1,300 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Utility functions for saving images from ingestion results to disk as actual image files.
+
+This module provides comprehensive utilities for extracting and saving base64-encoded
+images from nv-ingest results to local filesystem. Features include:
+- Configurable filtering by image type (charts, tables, infographics, etc.)
+- Descriptive filename generation with source and page information
+- Organized directory structure by image type
+- Detailed image counting and statistics
+
+Typical use cases:
+- Debugging and visual inspection of extracted content
+- Quality assessment of image extraction pipeline
+"""
+
+import logging
+import os
+from typing import Any, Dict, List
+
+from nv_ingest_client.client.util.processing import get_valid_filename
+from nv_ingest_api.util.image_processing.transforms import save_image_to_disk, _detect_base64_image_format
+
+logger = logging.getLogger(__name__)
+
+
+def _detect_extension_from_content(image_content: str) -> str:
+    """
+    Get file extension by detecting original image format.
+    Falls back to .jpeg if detection fails or format is unknown.
+    """
+    DEFAULT_EXT = "jpg"  # must be either "jpg" or "png"
+    try:
+        fmt = _detect_base64_image_format(image_content).upper()
+    except Exception:
+        logger.warning("Image format detection failed; falling back to default '%s'.", DEFAULT_EXT)
+        return DEFAULT_EXT
+    ext_map = {
+        "JPEG": "jpg",
+        "JPG": "jpg",
+        "PNG": "png",
+    }
+    ext = ext_map.get(fmt, None)
+    if ext:
+        return ext
+    logger.warning("Unsupported image format '%s'; falling back to default '%s'.", fmt, DEFAULT_EXT)
+    return DEFAULT_EXT
+
+
+def save_images_to_disk(
+    response_data: List[Dict[str, Any]],
+    output_directory: str,
+    save_charts: bool = True,
+    save_tables: bool = True,
+    save_infographics: bool = True,
+    save_page_images: bool = False,
+    save_raw_images: bool = False,
+    organize_by_type: bool = True,
+    output_format: str = "auto",
+) -> Dict[str, int]:
+    """
+    Save base64-encoded images from ingestion results to disk as actual image files.
+
+    This utility extracts images from ingestion response data and saves them to disk
+    with descriptive filenames that include the image subtype and page information.
+    It provides granular control over which types of images to save.
+
+    Parameters
+    ----------
+    response_data : List[Dict[str, Any]]
+        List of document results from ingestion, each containing metadata with base64 images.
+    output_directory : str
+        Base directory where images will be saved.
+    save_charts : bool, optional
+        Whether to save chart images. Default is True.
+    save_tables : bool, optional
+        Whether to save table images. Default is True.
+    save_infographics : bool, optional
+        Whether to save infographic images. Default is True.
+    save_page_images : bool, optional
+        Whether to save page-as-image files. Default is False.
+    save_raw_images : bool, optional
+        Whether to save raw/natural images. Default is False.
+    organize_by_type : bool, optional
+        Whether to organize images into subdirectories by type. Default is True.
+    output_format : str, optional
+        Output image format for saved files. Default is "auto".
+        - "auto": Preserve original format (fastest, no conversion)
+        - "jpeg": Convert to JPEG (smaller files, good compression)
+        - "png": Convert to PNG (lossless quality)
+        Use "auto" for maximum speed by avoiding format conversion.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+
+    Raises
+    ------
+    ValueError
+        If output_format is not supported.
+
+    Examples
+    --------
+    >>> from nv_ingest_client.util.image_disk_utils import save_images_to_disk
+    >>>
+    >>> # Save only charts and tables
+    >>> counts = save_images_to_disk(
+    ...     response_data,
+    ...     "./output/images",
+    ...     save_charts=True,
+    ...     save_tables=True,
+    ...     save_page_images=False
+    ... )
+    >>> print(f"Saved {counts['chart']} charts and {counts['table']} tables")
+    """
+
+    if not response_data:
+        logger.warning("No response data provided")
+        return {}
+
+    # Validate format upfront to fail fast
+    normalized_format = output_format.lower()
+    if normalized_format not in ["auto", "png", "jpeg", "jpg"]:
+        raise ValueError(
+            f"Unsupported output format: '{output_format}'. Supported formats: 'auto', 'png', 'jpeg', 'jpg'"
+        )
+
+    # Initialize counters
+    image_counts = {"chart": 0, "table": 0, "infographic": 0, "page_image": 0, "image": 0, "total": 0}
+
+    # Create output directory
+    os.makedirs(output_directory, exist_ok=True)
+
+    for doc_idx, document in enumerate(response_data):
+        try:
+            metadata = document.get("metadata", {})
+            doc_type = document.get("document_type", "unknown")
+
+            # Skip documents without image content
+            image_content = metadata.get("content")
+            if not image_content:
+                continue
+
+            # Get document info for naming
+            source_metadata = metadata.get("source_metadata", {})
+            source_id = source_metadata.get("source_id", f"document_{doc_idx}")
+            clean_source_name = get_valid_filename(os.path.basename(source_id))
+
+            content_metadata = metadata.get("content_metadata", {})
+            subtype = content_metadata.get("subtype", "image")
+            page_number = content_metadata.get("page_number", 0)
+
+            # Apply filtering based on image subtype and user preferences
+            should_save = False
+            if subtype == "chart" and save_charts:
+                should_save = True
+            elif subtype == "table" and save_tables:
+                should_save = True
+            elif subtype == "infographic" and save_infographics:
+                should_save = True
+            elif subtype == "page_image" and save_page_images:
+                should_save = True
+            elif (
+                doc_type == "image"
+                and subtype not in ["chart", "table", "infographic", "page_image"]
+                and save_raw_images
+            ):
+                should_save = True
+                subtype = "image"  # Normalize subtype for consistent counting
+
+            if not should_save:
+                continue
+
+            # Determine file extension and target format (format already validated upfront)
+            if normalized_format in ["jpeg", "jpg"]:
+                file_ext, target_format = "jpeg", "jpeg"
+            elif normalized_format == "png":
+                file_ext, target_format = "png", "png"
+            else:  # normalized_format == "auto" - detect once and use result
+                detected_ext = _detect_extension_from_content(image_content)
+                if detected_ext == "png":
+                    file_ext, target_format = "png", "png"
+                else:  # detected_ext == "jpeg"
+                    file_ext, target_format = "jpeg", "jpeg"
+
+            if organize_by_type:
+                # Organize into subdirectories by image type
+                type_dir = os.path.join(output_directory, subtype)
+                os.makedirs(type_dir, exist_ok=True)
+                image_filename = f"{clean_source_name}_p{page_number}_{doc_idx}.{file_ext}"
+                image_path = os.path.join(type_dir, image_filename)
+            else:
+                # Flat directory structure with type in filename
+                image_filename = f"{clean_source_name}_{subtype}_p{page_number}_{doc_idx}.{file_ext}"
+                image_path = os.path.join(output_directory, image_filename)
+
+            # Save image using centralized API function
+            try:
+                success = save_image_to_disk(image_content, image_path, target_format)
+
+                if success:
+                    # Update image type counters
+                    image_counts[subtype] += 1
+                    image_counts["total"] += 1
+                    logger.debug(f"Saved {subtype} image: {image_path}")
+                else:
+                    logger.error(f"Failed to save {subtype} image for {clean_source_name}")
+
+            except Exception as e:
+                logger.error(f"Failed to save {subtype} image for {clean_source_name}: {e}")
+
+        except Exception as e:
+            logger.error(f"Failed to process document {doc_idx}: {e}")
+            continue
+
+    # Log summary statistics
+    if image_counts["total"] > 0:
+        logger.info(f"Successfully saved {image_counts['total']} images to {output_directory}")
+        for img_type, count in image_counts.items():
+            if img_type != "total" and count > 0:
+                logger.info(f"  - {img_type}: {count}")
+    else:
+        logger.info("No images were saved (none met filter criteria)")
+
+    return image_counts
+
+
+def save_images_from_response(response: Dict[str, Any], output_directory: str, **kwargs) -> Dict[str, int]:
+    """
+    Convenience function to save images from a full API response.
+
+    Parameters
+    ----------
+    response : Dict[str, Any]
+        Full API response containing a "data" field with document results.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Additional arguments passed to save_images_to_disk().
+        Includes output_format ("auto", "png", or "jpeg") and other filtering options.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+    """
+
+    if "data" not in response or not response["data"]:
+        logger.warning("No data found in response")
+        return {}
+
+    return save_images_to_disk(response["data"], output_directory, **kwargs)
+
+
+def save_images_from_ingestor_results(
+    results: List[List[Dict[str, Any]]], output_directory: str, **kwargs
+) -> Dict[str, int]:
+    """
+    Save images from Ingestor.ingest() results.
+
+    Parameters
+    ----------
+    results : List[List[Dict[str, Any]]]
+        Results from Ingestor.ingest(), where each inner list contains
+        document results for one source file. Can also handle LazyLoadedList
+        objects when save_to_disk=True is used.
+    output_directory : str
+        Directory where images will be saved.
+    **kwargs
+        Additional arguments passed to save_images_to_disk().
+        Includes output_format ("auto", "png", or "jpeg") and other filtering options.
+
+    Returns
+    -------
+    Dict[str, int]
+        Dictionary with counts of images saved by type.
+    """
+
+    # Flatten results from multiple documents into single list
+    all_documents = []
+    for doc_results in results:
+        if isinstance(doc_results, list):
+            # Standard list of document results
+            all_documents.extend(doc_results)
+        elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
+            # Handle LazyLoadedList or other sequence-like objects
+            try:
+                all_documents.extend(list(doc_results))
+            except Exception as e:
+                logger.warning(f"Failed to process document results: {e}")
+                continue
+        else:
+            # Handle single document case
+            all_documents.append(doc_results)
+
+    return save_images_to_disk(all_documents, output_directory, **kwargs)
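
For the disk utilities, a short usage sketch (the output directory is illustrative, and "results" is again assumed to come from a prior Ingestor.ingest() call; keyword arguments mirror the save_images_to_disk signature in the diff above):

    from nv_ingest_client.util.image_disk_utils import save_images_from_ingestor_results

    # results is assumed to come from a prior Ingestor.ingest() call
    counts = save_images_from_ingestor_results(
        results,
        "./extracted_images",  # illustrative output directory
        save_charts=True,
        save_tables=True,
        save_infographics=False,
        output_format="auto",  # keep the original encoding, no conversion
    )
    print(f"saved {counts.get('total', 0)} images")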
src/nv_ingest_client/util/util.py
@@ -35,6 +35,7 @@ class ClientConfigSchema:
             "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
         )
         self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+        self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")


 @unified_exception_handler
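
This one-line addition lets the MINIO_BUCKET environment variable drive the default bucket instead of a hard-coded literal. A minimal sketch of the resolution, assuming only what the hunk shows:

    import os

    from nv_ingest_client.util.util import ClientConfigSchema

    os.environ["MINIO_BUCKET"] = "my-ingest-bucket"  # illustrative value
    config = ClientConfigSchema()
    print(config.minio_bucket_name)  # "my-ingest-bucket"; "nv-ingest" when the variable is unset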
src/nv_ingest_client/util/vdb/milvus.py
@@ -17,8 +17,6 @@ import numpy as np
 import pandas as pd
 import requests
 from minio import Minio
-from minio.commonconfig import CopySource
-from minio.deleteobjects import DeleteObject
 from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
 from nv_ingest_client.util.transport import infer_microservice
 from nv_ingest_client.util.util import ClientConfigSchema
@@ -42,10 +40,10 @@ from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
 from pymilvus.orm.types import CONSISTENCY_BOUNDED
 from scipy.sparse import csr_array

+
 logger = logging.getLogger(__name__)

 CONSISTENCY = CONSISTENCY_BOUNDED
-MINIO_DEFAULT_BUCKET_NAME = "a-bucket"

 pandas_reader_map = {
     ".json": pd.read_json,
@@ -750,7 +748,7 @@ def bulk_insert_milvus(
     minio_endpoint: str = "localhost:9000",
     access_key: str = "minioadmin",
     secret_key: str = "minioadmin",
-    bucket_name: str = "nv-ingest",
+    bucket_name: str = None,
     username: str = None,
     password: str = None,
 ):
@@ -774,29 +772,16 @@ def bulk_insert_milvus(
     password : str, optional
         Milvus password.
     """
-    minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
-
     connections.connect(uri=milvus_uri, token=f"{username}:{password}")
     t_bulk_start = time.time()
     task_ids = []
-    uploaded_files = []
-    for files in writer.batch_files:
-        for f in files:
-            # Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
-            # so we first copy objects from the source bucket into 'a-bucket' before inserting.
-            try:
-                minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
-                uploaded_files.append(f)
-            except Exception as e:
-                logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
-
-        task_id = utility.do_bulk_insert(
-            collection_name=collection_name,
-            files=files,
-            consistency_level=CONSISTENCY,
-        )
-        task_ids.append(task_id)
-    # list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
+
+    task_id = utility.do_bulk_insert(
+        collection_name=collection_name,
+        files=[file for files in writer.batch_files for file in files],
+        consistency_level=CONSISTENCY,
+    )
+
     while len(task_ids) > 0:
         time.sleep(1)
         for task_id in task_ids:
@@ -812,9 +797,6 @@ def bulk_insert_milvus(
                 logger.error(f"Failed reason: {task.failed_reason}")
                 task_ids.remove(task_id)

-    # Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
-    minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
-
     t_bulk_end = time.time()
     logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")

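With the copy-into-'a-bucket' workaround removed, all batch files now go to a single do_bulk_insert call, flattened by the list comprehension in the new code. That comprehension is equivalent to chaining the nested batches; a sketch with illustrative file names:

    from itertools import chain

    batch_files = [["batch0/part0.parquet"], ["batch1/part0.parquet", "batch1/part1.parquet"]]

    # Same result as: [file for files in batch_files for file in files]
    flattened = list(chain.from_iterable(batch_files))
    print(flattened)  # ['batch0/part0.parquet', 'batch1/part0.parquet', 'batch1/part1.parquet']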
@@ -881,7 +863,7 @@ def create_bm25_model(
     return bm25_ef


-def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
+def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
     """
     This function takes the input records and creates a corpus,
     factoring in filters (i.e. texts, charts, tables) and fits
@@ -899,12 +881,46 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
         Milvus Collection to search against
     """
     count = 0
-    for element in records:
-        client.insert(collection_name=collection_name, data=[element])
-        count += 1
+    for idx in range(0, len(records), batch_size):
+        client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
+        count += len(records[idx : idx + batch_size])
+    client.flush(collection_name)
     logger.info(f"streamed {count} records")


+def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
+    """
+    This function waits for the index to be built. It checks
+    the indexed_rows of the index and waits for it to be equal
+    to the number of records. This only works for streaming inserts,
+    bulk inserts are not supported by this function
+    (refer to MilvusClient.refresh_load for bulk inserts).
+    """
+    index_names = utility.list_indexes(collection_name)
+    indexed_rows = 0
+    for index_name in index_names:
+        indexed_rows = 0
+        while indexed_rows < num_elements:
+            pos_movement = 10  # number of iterations allowed without noticing an increase in indexed_rows
+            for i in range(20):
+                new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
+                time.sleep(1)
+                logger.info(
+                    f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
+                )
+                if new_indexed_rows == num_elements:
+                    indexed_rows = new_indexed_rows
+                    break
+                # check if indexed_rows is staying the same, too many times means something is wrong
+                if new_indexed_rows == indexed_rows:
+                    pos_movement -= 1
+                # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
+                if pos_movement == 0:
+                    raise ValueError("Rows are not getting indexed as expected")
+                indexed_rows = new_indexed_rows
+    return indexed_rows
+
+
 def write_to_nvingest_collection(
     records,
     collection_name: str,
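
Together, these two changes move the streaming path from one insert per record to one insert per batch_size slice, followed by a flush and a poll until indexed_rows catches up. A minimal call-sequence sketch (the endpoint, collection name, and the hypothetical load_embedded_records helper are illustrative; the two functions are the ones defined above):

    from pymilvus import MilvusClient

    client = MilvusClient(uri="http://localhost:19530")  # illustrative endpoint
    records = load_embedded_records()  # hypothetical helper returning schema-shaped dicts

    # Inserts in slices of 5000 (the new default), then flushes once
    stream_insert_milvus(records, client, "nv_ingest_collection", batch_size=5000)

    # Blocks until every inserted row is reported as indexed (streaming inserts only)
    wait_for_index("nv_ingest_collection", len(records), client)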
@@ -920,7 +936,7 @@ def write_to_nvingest_collection(
     compute_bm25_stats: bool = True,
     access_key: str = "minioadmin",
     secret_key: str = "minioadmin",
-    bucket_name: str = "nv-ingest",
+    bucket_name: str = None,
     threshold: int = 1000,
     meta_dataframe=None,
     meta_source_field=None,
@@ -1026,8 +1042,12 @@ def write_to_nvingest_collection(
             client,
             collection_name,
         )
+        # Make sure all rows are indexed, decided not to wrap in a timeout because we dont
+        # know how long this should take, it is num_elements dependent.
+        wait_for_index(collection_name, num_elements, client)
     else:
         minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
+        bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
         if not minio_client.bucket_exists(bucket_name):
             minio_client.make_bucket(bucket_name)

@@ -1618,7 +1638,7 @@ def embed_index_collection(
     compute_bm25_stats: bool = True,
     access_key: str = "minioadmin",
     secret_key: str = "minioadmin",
-    bucket_name: str = "nv-ingest",
+    bucket_name: str = None,
    meta_dataframe: Union[str, pd.DataFrame] = None,
    meta_source_field: str = None,
    meta_fields: list[str] = None,
@@ -1658,7 +1678,7 @@ def embed_index_collection(
         compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
         access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
         secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
-        bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
+        bucket_name (str, optional): The name of the MinIO bucket.
         meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
             containing metadata. Defaults to None.
         meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1774,7 +1794,7 @@ def reindex_collection(
     compute_bm25_stats: bool = True,
     access_key: str = "minioadmin",
     secret_key: str = "minioadmin",
-    bucket_name: str = "nv-ingest",
+    bucket_name: str = None,
     meta_dataframe: Union[str, pd.DataFrame] = None,
     meta_source_field: str = None,
     meta_fields: list[str] = None,
@@ -1815,7 +1835,7 @@ def reindex_collection(
         compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
         access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
         secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
-        bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
+        bucket_name (str, optional): The name of the MinIO bucket.
         meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
             containing metadata. Defaults to None.
         meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1923,7 +1943,7 @@ class Milvus(VDB):
         compute_bm25_stats: bool = True,
         access_key: str = "minioadmin",
         secret_key: str = "minioadmin",
-        bucket_name: str = "nv-ingest",
+        bucket_name: str = None,
         meta_dataframe: Union[str, pd.DataFrame] = None,
         meta_source_field: str = None,
         meta_fields: list[str] = None,
@@ -1954,7 +1974,7 @@ class Milvus(VDB):
            compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
            access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
            secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
-           bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
+           bucket_name (str, optional): The name of the MinIO bucket.
            meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
                containing metadata. Defaults to None.
            meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
src/nv_ingest_client.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.9.8.dev20250908
+Version: 2025.9.10.dev20250910
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
src/nv_ingest_client.egg-info/SOURCES.txt
@@ -42,6 +42,8 @@ src/nv_ingest_client/primitives/tasks/udf.py
 src/nv_ingest_client/primitives/tasks/vdb_upload.py
 src/nv_ingest_client/util/__init__.py
 src/nv_ingest_client/util/dataset.py
+src/nv_ingest_client/util/document_analysis.py
+src/nv_ingest_client/util/image_disk_utils.py
 src/nv_ingest_client/util/milvus.py
 src/nv_ingest_client/util/process_json_files.py
 src/nv_ingest_client/util/processing.py