nv-ingest-client 2025.9.8.dev20250908__tar.gz → 2025.9.9.dev20250909__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (60) hide show
  1. {nv_ingest_client-2025.9.8.dev20250908/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.9.dev20250909}/PKG-INFO +1 -1
  2. nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client/util/image_disk_utils.py +300 -0
  3. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/vdb/milvus.py +38 -4
  4. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  5. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
  6. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/LICENSE +0 -0
  7. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/MANIFEST.in +0 -0
  8. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/README.md +0 -0
  9. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/pyproject.toml +0 -0
  10. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/setup.cfg +0 -0
  11. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/__init__.py +0 -0
  12. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/cli/__init__.py +0 -0
  13. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  14. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/cli/util/click.py +0 -0
  15. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/cli/util/processing.py +0 -0
  16. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/cli/util/system.py +0 -0
  17. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/client/__init__.py +0 -0
  18. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/client/client.py +0 -0
  19. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/client/interface.py +0 -0
  20. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/client/util/processing.py +0 -0
  21. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  22. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/__init__.py +0 -0
  23. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  24. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
  25. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  26. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  27. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  28. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  29. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  30. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  31. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  32. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  33. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  34. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  35. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  36. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  37. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  38. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  39. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  40. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  41. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  42. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/__init__.py +0 -0
  43. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/dataset.py +0 -0
  44. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  45. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  46. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/milvus.py +0 -0
  47. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/process_json_files.py +0 -0
  48. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/processing.py +0 -0
  49. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/system.py +0 -0
  50. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/transport.py +0 -0
  51. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/util.py +0 -0
  52. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  53. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  54. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  55. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client/util/zipkin.py +0 -0
  56. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  57. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  58. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  59. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  60. {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.9.dev20250909}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.8.dev20250908
3
+ Version: 2025.9.9.dev20250909
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,300 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Utility functions for saving images from ingestion results to disk as actual image files.
7
+
8
+ This module provides comprehensive utilities for extracting and saving base64-encoded
9
+ images from nv-ingest results to local filesystem. Features include:
10
+ - Configurable filtering by image type (charts, tables, infographics, etc.)
11
+ - Descriptive filename generation with source and page information
12
+ - Organized directory structure by image type
13
+ - Detailed image counting and statistics
14
+
15
+ Typical use cases:
16
+ - Debugging and visual inspection of extracted content
17
+ - Quality assessment of image extraction pipeline
18
+ """
19
+
20
+ import logging
21
+ import os
22
+ from typing import Any, Dict, List
23
+
24
+ from nv_ingest_client.client.util.processing import get_valid_filename
25
+ from nv_ingest_api.util.image_processing.transforms import save_image_to_disk, _detect_base64_image_format
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def _detect_extension_from_content(image_content: str) -> str:
    """
    Pick a file extension by sniffing the format of a base64-encoded image.

    The detected format name is upper-cased and looked up in a small table of
    supported formats. If detection raises for any reason (including the
    upper-casing step), or the detected format is not in the table, a warning
    is logged and the default extension "jpg" is returned.

    Parameters
    ----------
    image_content : str
        Base64-encoded image payload handed to the format detector.

    Returns
    -------
    str
        Either "jpg" or "png" (no leading dot).
    """
    fallback = "jpg"  # must be either "jpg" or "png"
    known_extensions = {"JPEG": "jpg", "JPG": "jpg", "PNG": "png"}

    try:
        detected = _detect_base64_image_format(image_content).upper()
    except Exception:
        logger.warning("Image format detection failed; falling back to default '%s'.", fallback)
        return fallback

    if detected in known_extensions:
        return known_extensions[detected]

    logger.warning("Unsupported image format '%s'; falling back to default '%s'.", detected, fallback)
    return fallback
51
+
52
+
53
def save_images_to_disk(
    response_data: List[Dict[str, Any]],
    output_directory: str,
    save_charts: bool = True,
    save_tables: bool = True,
    save_infographics: bool = True,
    save_page_images: bool = False,
    save_raw_images: bool = False,
    organize_by_type: bool = True,
    output_format: str = "auto",
) -> Dict[str, int]:
    """
    Write the base64-encoded images found in ingestion results to image files.

    Each document's ``metadata["content"]`` is treated as the image payload.
    Documents are filtered by ``metadata["content_metadata"]["subtype"]`` and
    the ``save_*`` switches, then written under ``output_directory`` with a
    filename built from the source name, page number and document index.

    Parameters
    ----------
    response_data : List[Dict[str, Any]]
        Document results from ingestion. Entries with no ``metadata["content"]``
        are skipped.
    output_directory : str
        Base directory for the saved files; created if missing.
    save_charts : bool, optional
        Save documents whose subtype is "chart". Default is True.
    save_tables : bool, optional
        Save documents whose subtype is "table". Default is True.
    save_infographics : bool, optional
        Save documents whose subtype is "infographic". Default is True.
    save_page_images : bool, optional
        Save documents whose subtype is "page_image". Default is False.
    save_raw_images : bool, optional
        Save documents whose ``document_type`` is "image" and whose subtype is
        none of the four above; they are counted under "image". Default is False.
    organize_by_type : bool, optional
        If True, files go into one subdirectory per subtype. If False, all files
        share ``output_directory`` and the subtype is embedded in the filename.
        Default is True.
    output_format : str, optional
        "auto", "png", "jpeg" or "jpg" (case-insensitive). Default is "auto".
        With "auto" the format is detected from the content once per image;
        anything not detected as PNG is written as JPEG.

    Returns
    -------
    Dict[str, int]
        Counts keyed by "chart", "table", "infographic", "page_image", "image"
        and "total". An empty dict (no keys at all) is returned when
        ``response_data`` is empty.

    Raises
    ------
    ValueError
        If ``output_format`` is not one of the supported values. This check runs
        only after the empty-input check.

    Examples
    --------
    >>> from nv_ingest_client.util.image_disk_utils import save_images_to_disk
    >>>
    >>> counts = save_images_to_disk(
    ...     response_data,
    ...     "./output/images",
    ...     save_charts=True,
    ...     save_tables=True,
    ...     save_page_images=False
    ... )
    >>> print(f"Saved {counts['chart']} charts and {counts['table']} tables")
    """
    if not response_data:
        logger.warning("No response data provided")
        return {}

    # Reject bad formats before touching the filesystem.
    normalized_format = output_format.lower()
    if normalized_format not in ("auto", "png", "jpeg", "jpg"):
        raise ValueError(
            f"Unsupported output format: '{output_format}'. Supported formats: 'auto', 'png', 'jpeg', 'jpg'"
        )

    # Subtypes with a dedicated on/off switch; anything else is a candidate "raw" image.
    subtype_switches = (
        ("chart", save_charts),
        ("table", save_tables),
        ("infographic", save_infographics),
        ("page_image", save_page_images),
    )

    image_counts = {"chart": 0, "table": 0, "infographic": 0, "page_image": 0, "image": 0, "total": 0}

    os.makedirs(output_directory, exist_ok=True)

    for doc_idx, document in enumerate(response_data):
        try:
            metadata = document.get("metadata", {})
            doc_type = document.get("document_type", "unknown")

            image_content = metadata.get("content")
            if not image_content:
                # Nothing to write for this document.
                continue

            # Naming information is gathered before filtering, so a malformed
            # source id is reported even for documents that would be skipped.
            source_id = metadata.get("source_metadata", {}).get("source_id", f"document_{doc_idx}")
            clean_source_name = get_valid_filename(os.path.basename(source_id))

            content_metadata = metadata.get("content_metadata", {})
            subtype = content_metadata.get("subtype", "image")
            page_number = content_metadata.get("page_number", 0)

            # Decide whether this document passes the caller's filters.
            switch_hits = [enabled for kind, enabled in subtype_switches if subtype == kind]
            if switch_hits:
                if not switch_hits[0]:
                    continue
            elif doc_type == "image" and save_raw_images:
                subtype = "image"  # Normalize subtype for consistent counting
            else:
                continue

            # Resolve the on-disk format; detection only runs in "auto" mode.
            if normalized_format == "auto":
                detected_ext = _detect_extension_from_content(image_content)
                resolved_format = "png" if detected_ext == "png" else "jpeg"
            elif normalized_format == "png":
                resolved_format = "png"
            else:  # "jpeg" or "jpg"
                resolved_format = "jpeg"
            file_ext = target_format = resolved_format

            if organize_by_type:
                # One subdirectory per image type.
                type_dir = os.path.join(output_directory, subtype)
                os.makedirs(type_dir, exist_ok=True)
                image_filename = f"{clean_source_name}_p{page_number}_{doc_idx}.{file_ext}"
                image_path = os.path.join(type_dir, image_filename)
            else:
                # Flat layout: the type becomes part of the filename instead.
                image_filename = f"{clean_source_name}_{subtype}_p{page_number}_{doc_idx}.{file_ext}"
                image_path = os.path.join(output_directory, image_filename)

            # A failed write is logged and does not abort the remaining documents.
            try:
                success = save_image_to_disk(image_content, image_path, target_format)

                if not success:
                    logger.error(f"Failed to save {subtype} image for {clean_source_name}")
                else:
                    image_counts[subtype] += 1
                    image_counts["total"] += 1
                    logger.debug(f"Saved {subtype} image: {image_path}")

            except Exception as e:
                logger.error(f"Failed to save {subtype} image for {clean_source_name}: {e}")

        except Exception as e:
            logger.error(f"Failed to process document {doc_idx}: {e}")
            continue

    # Summary: overall total first, then each non-empty category.
    if image_counts["total"] > 0:
        logger.info(f"Successfully saved {image_counts['total']} images to {output_directory}")
        for img_type, count in image_counts.items():
            if img_type != "total" and count > 0:
                logger.info(f"  - {img_type}: {count}")
    else:
        logger.info("No images were saved (none met filter criteria)")

    return image_counts
230
+
231
+
232
def save_images_from_response(response: Dict[str, Any], output_directory: str, **kwargs) -> Dict[str, int]:
    """
    Save images from a full API response by delegating to save_images_to_disk().

    Parameters
    ----------
    response : Dict[str, Any]
        Full API response; the document results are read from its "data" field.
    output_directory : str
        Directory where images will be saved.
    **kwargs
        Forwarded unchanged to save_images_to_disk() (for example output_format
        and the save_* filtering switches).

    Returns
    -------
    Dict[str, int]
        Counts of saved images by type, or an empty dict when the response has
        no "data" field or the field is empty.
    """
    documents = response["data"] if "data" in response else None
    if documents:
        return save_images_to_disk(documents, output_directory, **kwargs)

    logger.warning("No data found in response")
    return {}
257
+
258
+
259
def save_images_from_ingestor_results(
    results: List[List[Dict[str, Any]]], output_directory: str, **kwargs
) -> Dict[str, int]:
    """
    Save images from Ingestor.ingest() results.

    The per-source result groups are flattened into a single list of documents,
    which is then handed to save_images_to_disk().

    Parameters
    ----------
    results : List[List[Dict[str, Any]]]
        Results from Ingestor.ingest(), where each inner list contains
        document results for one source file. Entries may also be other
        sized iterables (such as the LazyLoadedList objects produced when
        save_to_disk=True is used) or a single document dict.
    output_directory : str
        Directory where images will be saved.
    **kwargs
        Additional arguments passed to save_images_to_disk().
        Includes output_format ("auto", "png", or "jpeg") and other filtering options.

    Returns
    -------
    Dict[str, int]
        Dictionary with counts of images saved by type.
    """
    all_documents: List[Dict[str, Any]] = []
    for doc_results in results:
        if isinstance(doc_results, dict):
            # A single document. This must be tested before the generic
            # "sized iterable" branch: a dict has both __iter__ and __len__, and
            # expanding it there would add its key strings instead of the document.
            all_documents.append(doc_results)
        elif isinstance(doc_results, list):
            # Standard list of document results
            all_documents.extend(doc_results)
        elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
            # LazyLoadedList or other sequence-like objects. Materialize first
            # so a failure part-way through does not leave a partial group behind.
            try:
                all_documents.extend(list(doc_results))
            except Exception as e:
                logger.warning(f"Failed to process document results: {e}")
                continue
        else:
            # Anything else is passed through as a single entry.
            all_documents.append(doc_results)

    return save_images_to_disk(all_documents, output_directory, **kwargs)
@@ -42,6 +42,7 @@ from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
42
42
  from pymilvus.orm.types import CONSISTENCY_BOUNDED
43
43
  from scipy.sparse import csr_array
44
44
 
45
+
45
46
  logger = logging.getLogger(__name__)
46
47
 
47
48
  CONSISTENCY = CONSISTENCY_BOUNDED
@@ -881,7 +882,7 @@ def create_bm25_model(
881
882
  return bm25_ef
882
883
 
883
884
 
884
- def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
885
+ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
885
886
  """
886
887
  This function takes the input records and creates a corpus,
887
888
  factoring in filters (i.e. texts, charts, tables) and fits
@@ -899,12 +900,42 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
899
900
  Milvus Collection to search against
900
901
  """
901
902
  count = 0
902
- for element in records:
903
- client.insert(collection_name=collection_name, data=[element])
904
- count += 1
903
+ for idx in range(0, len(records), batch_size):
904
+ client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
905
+ count += len(records[idx : idx + batch_size])
905
906
  logger.info(f"streamed {count} records")
906
907
 
907
908
 
909
def wait_for_index(
    collection_name: str,
    num_elements: int,
    client: MilvusClient,
    poll_interval: float = 1.0,
    max_stalled_polls: int = 10,
):
    """
    Block until every index of a collection reports at least ``num_elements`` indexed rows.

    For each index returned by ``utility.list_indexes`` the function polls
    ``client.describe_index(...)["indexed_rows"]`` until the count reaches
    ``num_elements``. This only works for streaming inserts; bulk inserts are
    not supported by this function (refer to MilvusClient.refresh_load for
    bulk inserts).

    Parameters
    ----------
    collection_name : str
        Name of the Milvus collection whose indexes are checked.
    num_elements : int
        Number of rows each index is expected to contain.
    client : MilvusClient
        Client used to query index status.
    poll_interval : float, optional
        Seconds to sleep between polls. Default is 1.0.
    max_stalled_polls : int, optional
        Number of consecutive polls with an unchanged ``indexed_rows`` value
        after which indexing is considered stuck. Default is 10.

    Returns
    -------
    int
        The ``indexed_rows`` value last observed for the final index, or 0 if
        the collection has no indexes.

    Raises
    ------
    ValueError
        If ``indexed_rows`` does not change for ``max_stalled_polls``
        consecutive polls while still below ``num_elements``.
    """
    indexed_rows = 0
    for index_name in utility.list_indexes(collection_name):
        indexed_rows = 0
        stalled_polls = 0
        while indexed_rows < num_elements:
            new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]

            # ">=" rather than "==": an index that already holds more rows than
            # were just inserted is complete, not stalled.
            if new_indexed_rows >= num_elements:
                indexed_rows = new_indexed_rows
                break

            # Count consecutive polls without progress. Any progress resets
            # the budget so a slow but advancing build is never aborted.
            if new_indexed_rows == indexed_rows:
                stalled_polls += 1
                if stalled_polls >= max_stalled_polls:
                    raise ValueError("Rows are not getting indexed as expected")
            else:
                stalled_polls = 0

            indexed_rows = new_indexed_rows
            time.sleep(poll_interval)
    return indexed_rows
937
+
938
+
908
939
  def write_to_nvingest_collection(
909
940
  records,
910
941
  collection_name: str,
@@ -1026,6 +1057,9 @@ def write_to_nvingest_collection(
1026
1057
  client,
1027
1058
  collection_name,
1028
1059
  )
1060
+ # Make sure all rows are indexed; decided not to wrap this in a timeout because we don't
1061
+ # know how long this should take, it is num_elements dependent.
1062
+ wait_for_index(collection_name, num_elements, client)
1029
1063
  else:
1030
1064
  minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1031
1065
  if not minio_client.bucket_exists(bucket_name):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.8.dev20250908
3
+ Version: 2025.9.9.dev20250909
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -42,6 +42,7 @@ src/nv_ingest_client/primitives/tasks/udf.py
42
42
  src/nv_ingest_client/primitives/tasks/vdb_upload.py
43
43
  src/nv_ingest_client/util/__init__.py
44
44
  src/nv_ingest_client/util/dataset.py
45
+ src/nv_ingest_client/util/image_disk_utils.py
45
46
  src/nv_ingest_client/util/milvus.py
46
47
  src/nv_ingest_client/util/process_json_files.py
47
48
  src/nv_ingest_client/util/processing.py