nv-ingest-client 2025.9.9.dev20250909__tar.gz → 2025.9.11.dev20250911__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (61) hide show
  1. {nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.11.dev20250911}/PKG-INFO +1 -1
  2. nv_ingest_client-2025.9.11.dev20250911/src/nv_ingest_client/util/document_analysis.py +314 -0
  3. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/util.py +1 -0
  4. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/milvus.py +20 -34
  5. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  6. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
  7. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/LICENSE +0 -0
  8. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/MANIFEST.in +0 -0
  9. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/README.md +0 -0
  10. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/pyproject.toml +0 -0
  11. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/setup.cfg +0 -0
  12. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/__init__.py +0 -0
  13. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/__init__.py +0 -0
  14. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  15. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/click.py +0 -0
  16. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/processing.py +0 -0
  17. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/system.py +0 -0
  18. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/__init__.py +0 -0
  19. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/client.py +0 -0
  20. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/interface.py +0 -0
  21. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/util/processing.py +0 -0
  22. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  23. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/__init__.py +0 -0
  24. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  25. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
  26. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  27. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  28. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  29. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  30. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  31. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  32. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  33. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  34. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  35. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  36. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  37. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  38. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  39. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  40. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  41. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  42. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  43. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/__init__.py +0 -0
  44. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/dataset.py +0 -0
  45. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  46. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  47. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  48. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/milvus.py +0 -0
  49. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/process_json_files.py +0 -0
  50. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/processing.py +0 -0
  51. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/system.py +0 -0
  52. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/transport.py +0 -0
  53. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  54. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  55. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  56. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/zipkin.py +0 -0
  57. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  58. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  59. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  60. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  61. {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.9.dev20250909
3
+ Version: 2025.9.11.dev20250911
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,314 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Utility functions for analyzing document-level chunk composition from nv-ingest results.
7
+
8
+ This module provides analysis capabilities for understanding the distribution and types
9
+ of extracted content elements across individual documents. It enables customers to
10
+ gain visibility into their document composition for performance optimization and
11
+ capacity planning decisions.
12
+ """
13
+
14
+ import logging
15
+ import os
16
+ from collections import defaultdict
17
+ from typing import Any, Dict, List, Union
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def analyze_document_chunks(
    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
) -> Dict[str, Dict[str, Dict[str, int]]]:
    """
    Tally extracted elements per document, per page, and per element category.

    The ingestion results are first normalized to one list of elements per
    document, then every element is bucketed by document name and page key,
    and finally each bucket is reduced to a count per element category. A
    document-wide ``"total"`` entry is stored alongside the per-page entries.

    Parameters
    ----------
    results : Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
        Output of ``ingestor.ingest()``, either nested (one inner sequence per
        document, which may be a LazyLoadedList) or flattened into a single
        list of element dictionaries.

    Returns
    -------
    Dict[str, Dict[str, Dict[str, int]]]
        Mapping of document name -> page key -> category counts. Page keys are
        stringified page numbers or ``"unknown"``; the extra ``"total"`` key
        holds the sums over all pages of that document, for example::

            {
                "document1.pdf": {
                    "1": {"text": 3, "charts": 1, "tables": 0,
                          "unstructured_images": 0, "infographics": 0, "page_images": 0},
                    "2": {"text": 4, "charts": 0, "tables": 1,
                          "unstructured_images": 0, "infographics": 0, "page_images": 0},
                    "total": {"text": 7, "charts": 1, "tables": 1,
                              "unstructured_images": 0, "infographics": 0, "page_images": 0},
                },
            }

        An empty dictionary is returned when ``results`` is empty.

    Notes
    -----
    - Requires purge_results_after_upload=False in vdb_upload() configuration
    - Categories that never occur are still present with a count of zero
    - Elements are assumed to carry the standard nv-ingest metadata layout;
      a missing ``source_id`` or ``document_type`` raises ``KeyError``

    Examples
    --------
    >>> from nv_ingest_client.util.document_analysis import analyze_document_chunks
    >>> results, failures = ingestor.ingest(show_progress=True, return_failures=True)
    >>> breakdown = analyze_document_chunks(results)
    >>> for doc_name, pages in breakdown.items():
    ...     totals = pages["total"]
    ...     page_count = len(pages) - 1  # every document carries one extra "total" key
    ...     print(f"{doc_name}: {sum(totals.values())} elements across {page_count} pages")
    """

    if not results:
        logger.warning("No results provided for analysis")
        return {}

    # Phase 1: bucket every element under its document name and page key.
    # Names and page keys are resolved for all elements before any element is
    # categorized, so warnings are emitted in a stable order.
    grouped: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
    for doc_results in _normalize_results_format(results):
        for item in _extract_elements_from_doc(doc_results):
            name = _extract_document_name(item)
            page = _extract_page_key(item)
            grouped.setdefault(name, {}).setdefault(page, []).append(item)

    # Phase 2: reduce each bucket to category counts and accumulate totals.
    report: Dict[str, Dict[str, Dict[str, int]]] = {}
    for name, page_map in grouped.items():
        per_page: Dict[str, Dict[str, int]] = {}
        doc_totals = _initialize_element_counts()

        for page, items in page_map.items():
            tally = _initialize_element_counts()
            for item in items:
                category = _categorize_element(item)
                tally[category] += 1
                doc_totals[category] += 1
            per_page[page] = tally

        # The document-wide summary is stored last, after all page entries.
        per_page["total"] = doc_totals
        report[name] = per_page

    if not report:
        logger.warning("No valid documents found for analysis")
        return report

    total_docs = len(report)
    # Each document holds one non-page entry ("total"), hence the "- 1".
    total_pages = sum(len(entries) - 1 for entries in report.values())
    total_elements = sum(sum(entries["total"].values()) for entries in report.values())
    logger.info(f"Analyzed {total_elements} elements across {total_pages} pages in {total_docs} documents")

    return report
141
+
142
+
143
+ def _normalize_results_format(results: Union[List[List[Dict]], List[Dict]]) -> List[List[Dict]]:
144
+ """
145
+ Normalize various input formats to consistent List[List[Dict]] structure.
146
+
147
+ Parameters
148
+ ----------
149
+ results : Union[List[List[Dict]], List[Dict]]
150
+ Input results in various formats
151
+
152
+ Returns
153
+ -------
154
+ List[List[Dict]]
155
+ Normalized results in standard format
156
+ """
157
+
158
+ if not results:
159
+ return []
160
+
161
+ # Handle List[List[Dict]] or List[LazyLoadedList] formats
162
+ if isinstance(results, list) and len(results) > 0:
163
+ first_elem = results[0]
164
+ # Check for list, LazyLoadedList, or any sequence-like object
165
+ if isinstance(first_elem, list) or (
166
+ hasattr(first_elem, "__iter__") and hasattr(first_elem, "__len__") and not isinstance(first_elem, dict)
167
+ ):
168
+ return results
169
+
170
+ # Handle flattened List[Dict] format by grouping elements by document
171
+ if isinstance(results, list) and len(results) > 0 and isinstance(results[0], dict):
172
+ doc_groups = defaultdict(list)
173
+ for element in results:
174
+ doc_name = _extract_document_name(element)
175
+ doc_groups[doc_name].append(element)
176
+
177
+ return list(doc_groups.values())
178
+
179
+ # Fallback for unexpected formats
180
+ return [[item] for item in results if item]
181
+
182
+
183
+ def _extract_elements_from_doc(doc_results) -> List[Dict]:
184
+ """
185
+ Extract elements from document results, handling various data types.
186
+
187
+ Parameters
188
+ ----------
189
+ doc_results : Any
190
+ Document results which may be a list, LazyLoadedList, or other iterable
191
+
192
+ Returns
193
+ -------
194
+ List[Dict]
195
+ List of element dictionaries
196
+ """
197
+
198
+ if isinstance(doc_results, list):
199
+ return doc_results
200
+ elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
201
+ # Handle LazyLoadedList and other sequence-like objects
202
+ return list(doc_results)
203
+ else:
204
+ # Single element case
205
+ return [doc_results] if doc_results else []
206
+
207
+
208
+ def _extract_document_name(element: Dict[str, Any]) -> str:
209
+ """
210
+ Extract clean document name from element metadata.
211
+
212
+ Parameters
213
+ ----------
214
+ element : Dict[str, Any]
215
+ Element dictionary containing metadata
216
+
217
+ Returns
218
+ -------
219
+ str
220
+ Clean document filename (basename of source_id)
221
+ """
222
+
223
+ # nv-ingest guarantees this structure exists
224
+ source_id = element["metadata"]["source_metadata"]["source_id"]
225
+ return os.path.basename(source_id)
226
+
227
+
228
+ def _extract_page_key(element: Dict[str, Any]) -> str:
229
+ """
230
+ Extract page key from element metadata for consistent page naming.
231
+
232
+ Parameters
233
+ ----------
234
+ element : Dict[str, Any]
235
+ Element dictionary containing metadata
236
+
237
+ Returns
238
+ -------
239
+ str
240
+ Page number as string (e.g., "1", "2", or "unknown")
241
+ """
242
+
243
+ try:
244
+ page_number = element["metadata"]["content_metadata"]["page_number"]
245
+ if page_number is not None and page_number >= 0:
246
+ return str(page_number)
247
+ else:
248
+ return "unknown"
249
+ except (KeyError, TypeError):
250
+ logger.warning("Missing or invalid page_number in element metadata")
251
+ return "unknown"
252
+
253
+
254
+ def _categorize_element(element: Dict[str, Any]) -> str:
255
+ """
256
+ Categorize element by type using document_type and content metadata.
257
+
258
+ Parameters
259
+ ----------
260
+ element : Dict[str, Any]
261
+ Element dictionary with document_type and metadata fields
262
+
263
+ Returns
264
+ -------
265
+ str
266
+ Element category: "text", "charts", "tables", "unstructured_images",
267
+ "infographics", or "page_images"
268
+ """
269
+
270
+ doc_type = element["document_type"]
271
+
272
+ # Text elements
273
+ if doc_type == "text":
274
+ return "text"
275
+
276
+ # Structured elements with subtypes
277
+ elif doc_type == "structured":
278
+ subtype = element["metadata"]["content_metadata"]["subtype"]
279
+ if subtype == "chart":
280
+ return "charts"
281
+ elif subtype == "table":
282
+ return "tables"
283
+ elif subtype == "infographic":
284
+ return "infographics"
285
+ elif subtype == "page_image":
286
+ return "page_images"
287
+
288
+ # Image elements (unstructured)
289
+ elif doc_type == "image":
290
+ return "unstructured_images"
291
+
292
+ # Should not reach here with valid nv-ingest output
293
+ logger.warning(f"Unexpected element type: {doc_type}")
294
+ return "text" # Default to text for safety
295
+
296
+
297
+ def _initialize_element_counts() -> Dict[str, int]:
298
+ """
299
+ Initialize element counts dictionary with all supported types.
300
+
301
+ Returns
302
+ -------
303
+ Dict[str, int]
304
+ Dictionary with zero counts for all element types
305
+ """
306
+
307
+ return {
308
+ "text": 0,
309
+ "charts": 0,
310
+ "tables": 0,
311
+ "unstructured_images": 0,
312
+ "infographics": 0,
313
+ "page_images": 0,
314
+ }
@@ -35,6 +35,7 @@ class ClientConfigSchema:
35
35
  "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
36
36
  )
37
37
  self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
38
+ self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
38
39
 
39
40
 
40
41
  @unified_exception_handler
@@ -17,8 +17,6 @@ import numpy as np
17
17
  import pandas as pd
18
18
  import requests
19
19
  from minio import Minio
20
- from minio.commonconfig import CopySource
21
- from minio.deleteobjects import DeleteObject
22
20
  from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
23
21
  from nv_ingest_client.util.transport import infer_microservice
24
22
  from nv_ingest_client.util.util import ClientConfigSchema
@@ -46,7 +44,6 @@ from scipy.sparse import csr_array
46
44
  logger = logging.getLogger(__name__)
47
45
 
48
46
  CONSISTENCY = CONSISTENCY_BOUNDED
49
- MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
50
47
 
51
48
  pandas_reader_map = {
52
49
  ".json": pd.read_json,
@@ -751,7 +748,7 @@ def bulk_insert_milvus(
751
748
  minio_endpoint: str = "localhost:9000",
752
749
  access_key: str = "minioadmin",
753
750
  secret_key: str = "minioadmin",
754
- bucket_name: str = "nv-ingest",
751
+ bucket_name: str = None,
755
752
  username: str = None,
756
753
  password: str = None,
757
754
  ):
@@ -775,29 +772,16 @@ def bulk_insert_milvus(
775
772
  password : str, optional
776
773
  Milvus password.
777
774
  """
778
- minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
779
-
780
775
  connections.connect(uri=milvus_uri, token=f"{username}:{password}")
781
776
  t_bulk_start = time.time()
782
777
  task_ids = []
783
- uploaded_files = []
784
- for files in writer.batch_files:
785
- for f in files:
786
- # Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
787
- # so we first copy objects from the source bucket into 'a-bucket' before inserting.
788
- try:
789
- minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
790
- uploaded_files.append(f)
791
- except Exception as e:
792
- logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
793
-
794
- task_id = utility.do_bulk_insert(
795
- collection_name=collection_name,
796
- files=files,
797
- consistency_level=CONSISTENCY,
798
- )
799
- task_ids.append(task_id)
800
- # list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
778
+
779
+ task_id = utility.do_bulk_insert(
780
+ collection_name=collection_name,
781
+ files=[file for files in writer.batch_files for file in files],
782
+ consistency_level=CONSISTENCY,
783
+ )
784
+
801
785
  while len(task_ids) > 0:
802
786
  time.sleep(1)
803
787
  for task_id in task_ids:
@@ -813,9 +797,6 @@ def bulk_insert_milvus(
813
797
  logger.error(f"Failed reason: {task.failed_reason}")
814
798
  task_ids.remove(task_id)
815
799
 
816
- # Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
817
- minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
818
-
819
800
  t_bulk_end = time.time()
820
801
  logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
821
802
 
@@ -903,6 +884,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
903
884
  for idx in range(0, len(records), batch_size):
904
885
  client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
905
886
  count += len(records[idx : idx + batch_size])
887
+ client.flush(collection_name)
906
888
  logger.info(f"streamed {count} records")
907
889
 
908
890
 
@@ -923,6 +905,9 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
923
905
  for i in range(20):
924
906
  new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
925
907
  time.sleep(1)
908
+ logger.info(
909
+ f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
910
+ )
926
911
  if new_indexed_rows == num_elements:
927
912
  indexed_rows = new_indexed_rows
928
913
  break
@@ -951,7 +936,7 @@ def write_to_nvingest_collection(
951
936
  compute_bm25_stats: bool = True,
952
937
  access_key: str = "minioadmin",
953
938
  secret_key: str = "minioadmin",
954
- bucket_name: str = "nv-ingest",
939
+ bucket_name: str = None,
955
940
  threshold: int = 1000,
956
941
  meta_dataframe=None,
957
942
  meta_source_field=None,
@@ -1062,6 +1047,7 @@ def write_to_nvingest_collection(
1062
1047
  wait_for_index(collection_name, num_elements, client)
1063
1048
  else:
1064
1049
  minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1050
+ bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
1065
1051
  if not minio_client.bucket_exists(bucket_name):
1066
1052
  minio_client.make_bucket(bucket_name)
1067
1053
 
@@ -1652,7 +1638,7 @@ def embed_index_collection(
1652
1638
  compute_bm25_stats: bool = True,
1653
1639
  access_key: str = "minioadmin",
1654
1640
  secret_key: str = "minioadmin",
1655
- bucket_name: str = "nv-ingest",
1641
+ bucket_name: str = None,
1656
1642
  meta_dataframe: Union[str, pd.DataFrame] = None,
1657
1643
  meta_source_field: str = None,
1658
1644
  meta_fields: list[str] = None,
@@ -1692,7 +1678,7 @@ def embed_index_collection(
1692
1678
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1693
1679
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1694
1680
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1695
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
1681
+ bucket_name (str, optional): The name of the MinIO bucket.
1696
1682
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1697
1683
  containing metadata. Defaults to None.
1698
1684
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1808,7 +1794,7 @@ def reindex_collection(
1808
1794
  compute_bm25_stats: bool = True,
1809
1795
  access_key: str = "minioadmin",
1810
1796
  secret_key: str = "minioadmin",
1811
- bucket_name: str = "nv-ingest",
1797
+ bucket_name: str = None,
1812
1798
  meta_dataframe: Union[str, pd.DataFrame] = None,
1813
1799
  meta_source_field: str = None,
1814
1800
  meta_fields: list[str] = None,
@@ -1849,7 +1835,7 @@ def reindex_collection(
1849
1835
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1850
1836
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1851
1837
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1852
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
1838
+ bucket_name (str, optional): The name of the MinIO bucket.
1853
1839
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1854
1840
  containing metadata. Defaults to None.
1855
1841
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1957,7 +1943,7 @@ class Milvus(VDB):
1957
1943
  compute_bm25_stats: bool = True,
1958
1944
  access_key: str = "minioadmin",
1959
1945
  secret_key: str = "minioadmin",
1960
- bucket_name: str = "nv-ingest",
1946
+ bucket_name: str = None,
1961
1947
  meta_dataframe: Union[str, pd.DataFrame] = None,
1962
1948
  meta_source_field: str = None,
1963
1949
  meta_fields: list[str] = None,
@@ -1988,7 +1974,7 @@ class Milvus(VDB):
1988
1974
  compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
1989
1975
  access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
1990
1976
  secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
1991
- bucket_name (str, optional): The name of the MinIO bucket. Defaults to "nv-ingest".
1977
+ bucket_name (str, optional): The name of the MinIO bucket.
1992
1978
  meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
1993
1979
  containing metadata. Defaults to None.
1994
1980
  meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.9.dev20250909
3
+ Version: 2025.9.11.dev20250911
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -42,6 +42,7 @@ src/nv_ingest_client/primitives/tasks/udf.py
42
42
  src/nv_ingest_client/primitives/tasks/vdb_upload.py
43
43
  src/nv_ingest_client/util/__init__.py
44
44
  src/nv_ingest_client/util/dataset.py
45
+ src/nv_ingest_client/util/document_analysis.py
45
46
  src/nv_ingest_client/util/image_disk_utils.py
46
47
  src/nv_ingest_client/util/milvus.py
47
48
  src/nv_ingest_client/util/process_json_files.py