nv-ingest-client 2025.9.8.dev20250908__tar.gz → 2025.9.10.dev20250910__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- {nv_ingest_client-2025.9.8.dev20250908/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.10.dev20250910}/PKG-INFO +1 -1
- nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client/util/image_disk_utils.py +300 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/util.py +1 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/milvus.py +58 -38
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/SOURCES.txt +2 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/LICENSE +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/README.md +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/pyproject.toml +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/setup.cfg +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/client.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/interface.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/version.py +0 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Utility functions for analyzing document-level chunk composition from nv-ingest results.
|
|
7
|
+
|
|
8
|
+
This module provides analysis capabilities for understanding the distribution and types
|
|
9
|
+
of extracted content elements across individual documents. It enables customers to
|
|
10
|
+
gain visibility into their document composition for performance optimization and
|
|
11
|
+
capacity planning decisions.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from typing import Any, Dict, List, Union
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def analyze_document_chunks(
    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
) -> Dict[str, Dict[str, Dict[str, int]]]:
    """
    Build a per-document, per-page breakdown of extracted element types.

    Processes nv-ingest ingestion results and reports, for every document,
    how many elements of each category ("text", "charts", "tables",
    "unstructured_images", "infographics", "page_images") appear on each
    page, plus a document-wide "total" entry.

    Parameters
    ----------
    results : Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
        Output of ``ingestor.ingest()`` in the standard nested format, or a
        flattened list of element dicts. LazyLoadedList containers are
        handled transparently.

    Returns
    -------
    Dict[str, Dict[str, Dict[str, int]]]
        ``{doc_name: {"total": {...}, "<page>": {...}, ...}}`` where each
        inner dict maps every element category to a count (zero when absent).
        Page keys are string page numbers, or "unknown" when the page number
        is missing or invalid.

    Notes
    -----
    - Requires ``purge_results_after_upload=False`` in ``vdb_upload()`` so
      the element metadata is still available.
    - Assumes valid nv-ingest output with the guaranteed metadata structure.

    Examples
    --------
    >>> breakdown = analyze_document_chunks(results)
    >>> for doc_name, pages in breakdown.items():
    ...     print(doc_name, sum(pages["total"].values()), "elements")
    """
    if not results:
        logger.warning("No results provided for analysis")
        return {}

    # Bucket every element under (document name -> page key).
    grouped = defaultdict(lambda: defaultdict(list))
    for doc_results in _normalize_results_format(results):
        for element in _extract_elements_from_doc(doc_results):
            doc_name = _extract_document_name(element)
            page_key = _extract_page_key(element)
            grouped[doc_name][page_key].append(element)

    # Tally categories per page while accumulating a document-wide total.
    breakdown: Dict[str, Dict[str, Dict[str, int]]] = {}
    for doc_name, pages in grouped.items():
        per_page: Dict[str, Dict[str, int]] = {}
        doc_totals = _initialize_element_counts()

        for page_key, elements in pages.items():
            page_counts = _initialize_element_counts()
            for element in elements:
                category = _categorize_element(element)
                page_counts[category] += 1
                doc_totals[category] += 1
            per_page[page_key] = page_counts

        # The synthetic "total" entry is appended after the real pages.
        per_page["total"] = doc_totals
        breakdown[doc_name] = per_page

    if breakdown:
        total_docs = len(breakdown)
        # Each document carries one extra "total" key that is not a page.
        total_pages = sum(len(pages) - 1 for pages in breakdown.values())
        total_elements = sum(sum(pages["total"].values()) for pages in breakdown.values())
        logger.info(f"Analyzed {total_elements} elements across {total_pages} pages in {total_docs} documents")
    else:
        logger.warning("No valid documents found for analysis")

    return breakdown
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _normalize_results_format(results: Union[List[List[Dict]], List[Dict]]) -> List[List[Dict]]:
|
|
144
|
+
"""
|
|
145
|
+
Normalize various input formats to consistent List[List[Dict]] structure.
|
|
146
|
+
|
|
147
|
+
Parameters
|
|
148
|
+
----------
|
|
149
|
+
results : Union[List[List[Dict]], List[Dict]]
|
|
150
|
+
Input results in various formats
|
|
151
|
+
|
|
152
|
+
Returns
|
|
153
|
+
-------
|
|
154
|
+
List[List[Dict]]
|
|
155
|
+
Normalized results in standard format
|
|
156
|
+
"""
|
|
157
|
+
|
|
158
|
+
if not results:
|
|
159
|
+
return []
|
|
160
|
+
|
|
161
|
+
# Handle List[List[Dict]] or List[LazyLoadedList] formats
|
|
162
|
+
if isinstance(results, list) and len(results) > 0:
|
|
163
|
+
first_elem = results[0]
|
|
164
|
+
# Check for list, LazyLoadedList, or any sequence-like object
|
|
165
|
+
if isinstance(first_elem, list) or (
|
|
166
|
+
hasattr(first_elem, "__iter__") and hasattr(first_elem, "__len__") and not isinstance(first_elem, dict)
|
|
167
|
+
):
|
|
168
|
+
return results
|
|
169
|
+
|
|
170
|
+
# Handle flattened List[Dict] format by grouping elements by document
|
|
171
|
+
if isinstance(results, list) and len(results) > 0 and isinstance(results[0], dict):
|
|
172
|
+
doc_groups = defaultdict(list)
|
|
173
|
+
for element in results:
|
|
174
|
+
doc_name = _extract_document_name(element)
|
|
175
|
+
doc_groups[doc_name].append(element)
|
|
176
|
+
|
|
177
|
+
return list(doc_groups.values())
|
|
178
|
+
|
|
179
|
+
# Fallback for unexpected formats
|
|
180
|
+
return [[item] for item in results if item]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _extract_elements_from_doc(doc_results) -> List[Dict]:
|
|
184
|
+
"""
|
|
185
|
+
Extract elements from document results, handling various data types.
|
|
186
|
+
|
|
187
|
+
Parameters
|
|
188
|
+
----------
|
|
189
|
+
doc_results : Any
|
|
190
|
+
Document results which may be a list, LazyLoadedList, or other iterable
|
|
191
|
+
|
|
192
|
+
Returns
|
|
193
|
+
-------
|
|
194
|
+
List[Dict]
|
|
195
|
+
List of element dictionaries
|
|
196
|
+
"""
|
|
197
|
+
|
|
198
|
+
if isinstance(doc_results, list):
|
|
199
|
+
return doc_results
|
|
200
|
+
elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
|
|
201
|
+
# Handle LazyLoadedList and other sequence-like objects
|
|
202
|
+
return list(doc_results)
|
|
203
|
+
else:
|
|
204
|
+
# Single element case
|
|
205
|
+
return [doc_results] if doc_results else []
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _extract_document_name(element: Dict[str, Any]) -> str:
|
|
209
|
+
"""
|
|
210
|
+
Extract clean document name from element metadata.
|
|
211
|
+
|
|
212
|
+
Parameters
|
|
213
|
+
----------
|
|
214
|
+
element : Dict[str, Any]
|
|
215
|
+
Element dictionary containing metadata
|
|
216
|
+
|
|
217
|
+
Returns
|
|
218
|
+
-------
|
|
219
|
+
str
|
|
220
|
+
Clean document filename (basename of source_id)
|
|
221
|
+
"""
|
|
222
|
+
|
|
223
|
+
# nv-ingest guarantees this structure exists
|
|
224
|
+
source_id = element["metadata"]["source_metadata"]["source_id"]
|
|
225
|
+
return os.path.basename(source_id)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _extract_page_key(element: Dict[str, Any]) -> str:
|
|
229
|
+
"""
|
|
230
|
+
Extract page key from element metadata for consistent page naming.
|
|
231
|
+
|
|
232
|
+
Parameters
|
|
233
|
+
----------
|
|
234
|
+
element : Dict[str, Any]
|
|
235
|
+
Element dictionary containing metadata
|
|
236
|
+
|
|
237
|
+
Returns
|
|
238
|
+
-------
|
|
239
|
+
str
|
|
240
|
+
Page number as string (e.g., "1", "2", or "unknown")
|
|
241
|
+
"""
|
|
242
|
+
|
|
243
|
+
try:
|
|
244
|
+
page_number = element["metadata"]["content_metadata"]["page_number"]
|
|
245
|
+
if page_number is not None and page_number >= 0:
|
|
246
|
+
return str(page_number)
|
|
247
|
+
else:
|
|
248
|
+
return "unknown"
|
|
249
|
+
except (KeyError, TypeError):
|
|
250
|
+
logger.warning("Missing or invalid page_number in element metadata")
|
|
251
|
+
return "unknown"
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _categorize_element(element: Dict[str, Any]) -> str:
|
|
255
|
+
"""
|
|
256
|
+
Categorize element by type using document_type and content metadata.
|
|
257
|
+
|
|
258
|
+
Parameters
|
|
259
|
+
----------
|
|
260
|
+
element : Dict[str, Any]
|
|
261
|
+
Element dictionary with document_type and metadata fields
|
|
262
|
+
|
|
263
|
+
Returns
|
|
264
|
+
-------
|
|
265
|
+
str
|
|
266
|
+
Element category: "text", "charts", "tables", "unstructured_images",
|
|
267
|
+
"infographics", or "page_images"
|
|
268
|
+
"""
|
|
269
|
+
|
|
270
|
+
doc_type = element["document_type"]
|
|
271
|
+
|
|
272
|
+
# Text elements
|
|
273
|
+
if doc_type == "text":
|
|
274
|
+
return "text"
|
|
275
|
+
|
|
276
|
+
# Structured elements with subtypes
|
|
277
|
+
elif doc_type == "structured":
|
|
278
|
+
subtype = element["metadata"]["content_metadata"]["subtype"]
|
|
279
|
+
if subtype == "chart":
|
|
280
|
+
return "charts"
|
|
281
|
+
elif subtype == "table":
|
|
282
|
+
return "tables"
|
|
283
|
+
elif subtype == "infographic":
|
|
284
|
+
return "infographics"
|
|
285
|
+
elif subtype == "page_image":
|
|
286
|
+
return "page_images"
|
|
287
|
+
|
|
288
|
+
# Image elements (unstructured)
|
|
289
|
+
elif doc_type == "image":
|
|
290
|
+
return "unstructured_images"
|
|
291
|
+
|
|
292
|
+
# Should not reach here with valid nv-ingest output
|
|
293
|
+
logger.warning(f"Unexpected element type: {doc_type}")
|
|
294
|
+
return "text" # Default to text for safety
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _initialize_element_counts() -> Dict[str, int]:
|
|
298
|
+
"""
|
|
299
|
+
Initialize element counts dictionary with all supported types.
|
|
300
|
+
|
|
301
|
+
Returns
|
|
302
|
+
-------
|
|
303
|
+
Dict[str, int]
|
|
304
|
+
Dictionary with zero counts for all element types
|
|
305
|
+
"""
|
|
306
|
+
|
|
307
|
+
return {
|
|
308
|
+
"text": 0,
|
|
309
|
+
"charts": 0,
|
|
310
|
+
"tables": 0,
|
|
311
|
+
"unstructured_images": 0,
|
|
312
|
+
"infographics": 0,
|
|
313
|
+
"page_images": 0,
|
|
314
|
+
}
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Utility functions for saving images from ingestion results to disk as actual image files.
|
|
7
|
+
|
|
8
|
+
This module provides comprehensive utilities for extracting and saving base64-encoded
|
|
9
|
+
images from nv-ingest results to local filesystem. Features include:
|
|
10
|
+
- Configurable filtering by image type (charts, tables, infographics, etc.)
|
|
11
|
+
- Descriptive filename generation with source and page information
|
|
12
|
+
- Organized directory structure by image type
|
|
13
|
+
- Detailed image counting and statistics
|
|
14
|
+
|
|
15
|
+
Typical use cases:
|
|
16
|
+
- Debugging and visual inspection of extracted content
|
|
17
|
+
- Quality assessment of image extraction pipeline
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
from typing import Any, Dict, List
|
|
23
|
+
|
|
24
|
+
from nv_ingest_client.client.util.processing import get_valid_filename
|
|
25
|
+
from nv_ingest_api.util.image_processing.transforms import save_image_to_disk, _detect_base64_image_format
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _detect_extension_from_content(image_content: str) -> str:
    """
    Return the file extension matching the image's detected base64 format.

    Parameters
    ----------
    image_content : str
        Base64-encoded image payload.

    Returns
    -------
    str
        "png" for PNG images, "jpg" for JPEG images, and the default "jpg"
        when detection fails or reports an unsupported format.
    """
    DEFAULT_EXT = "jpg"  # must be either "jpg" or "png"
    try:
        fmt = _detect_base64_image_format(image_content).upper()
    except Exception:
        logger.warning("Image format detection failed; falling back to default '%s'.", DEFAULT_EXT)
        return DEFAULT_EXT

    ext = {"JPEG": "jpg", "JPG": "jpg", "PNG": "png"}.get(fmt)
    if ext is None:
        logger.warning("Unsupported image format '%s'; falling back to default '%s'.", fmt, DEFAULT_EXT)
        return DEFAULT_EXT
    return ext
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def save_images_to_disk(
    response_data: List[Dict[str, Any]],
    output_directory: str,
    save_charts: bool = True,
    save_tables: bool = True,
    save_infographics: bool = True,
    save_page_images: bool = False,
    save_raw_images: bool = False,
    organize_by_type: bool = True,
    output_format: str = "auto",
) -> Dict[str, int]:
    """
    Save base64-encoded images from ingestion results to disk as image files.

    Filters the extracted images by subtype according to the ``save_*``
    flags, writes each selected image with a descriptive filename that
    embeds source name, page number, and document index, and reports how
    many images of each type were written.

    Parameters
    ----------
    response_data : List[Dict[str, Any]]
        Document results from ingestion, each carrying metadata with a
        base64 image payload under ``metadata.content``.
    output_directory : str
        Base directory where images will be saved (created if missing).
    save_charts, save_tables, save_infographics : bool, optional
        Whether to save the respective structured image subtypes.
        Default True.
    save_page_images : bool, optional
        Whether to save page-as-image files. Default False.
    save_raw_images : bool, optional
        Whether to save raw/natural images. Default False.
    organize_by_type : bool, optional
        When True (default), place images in per-subtype subdirectories;
        otherwise encode the subtype in the filename.
    output_format : str, optional
        "auto" (default) preserves the original encoding with no
        conversion; "jpeg"/"jpg" or "png" force a conversion.

    Returns
    -------
    Dict[str, int]
        Counts of images saved per subtype plus a "total" entry. Empty
        dict when ``response_data`` is empty.

    Raises
    ------
    ValueError
        If ``output_format`` is not one of "auto", "png", "jpeg", "jpg".
    """
    if not response_data:
        logger.warning("No response data provided")
        return {}

    # Fail fast on a bad format before touching the filesystem.
    normalized_format = output_format.lower()
    if normalized_format not in ["auto", "png", "jpeg", "jpg"]:
        raise ValueError(
            f"Unsupported output format: '{output_format}'. Supported formats: 'auto', 'png', 'jpeg', 'jpg'"
        )

    image_counts = {"chart": 0, "table": 0, "infographic": 0, "page_image": 0, "image": 0, "total": 0}
    os.makedirs(output_directory, exist_ok=True)

    # User-facing flags keyed by the structured subtypes they control.
    subtype_enabled = {
        "chart": save_charts,
        "table": save_tables,
        "infographic": save_infographics,
        "page_image": save_page_images,
    }

    def resolve_target_format(content: str) -> str:
        """Pick the on-disk format: explicit formats skip detection; 'auto' keeps the original."""
        if normalized_format in ("jpeg", "jpg"):
            return "jpeg"
        if normalized_format == "png":
            return "png"
        # "auto": detector reports "png" or "jpg"; anything non-png is written as jpeg.
        return "png" if _detect_extension_from_content(content) == "png" else "jpeg"

    for doc_idx, document in enumerate(response_data):
        try:
            metadata = document.get("metadata", {})
            doc_type = document.get("document_type", "unknown")

            image_content = metadata.get("content")
            if not image_content:
                # Nothing to write for records without an image payload.
                continue

            source_metadata = metadata.get("source_metadata", {})
            source_id = source_metadata.get("source_id", f"document_{doc_idx}")
            clean_source_name = get_valid_filename(os.path.basename(source_id))

            content_metadata = metadata.get("content_metadata", {})
            subtype = content_metadata.get("subtype", "image")
            page_number = content_metadata.get("page_number", 0)

            # Decide whether this image passes the caller's filters.
            if subtype in subtype_enabled:
                should_save = subtype_enabled[subtype]
            elif doc_type == "image" and save_raw_images:
                should_save = True
                subtype = "image"  # Normalize subtype for consistent counting
            else:
                should_save = False

            if not should_save:
                continue

            target_format = resolve_target_format(image_content)
            # The extension always mirrors the target format ("jpeg" or "png").
            file_ext = target_format

            if organize_by_type:
                # One subdirectory per image type.
                type_dir = os.path.join(output_directory, subtype)
                os.makedirs(type_dir, exist_ok=True)
                image_path = os.path.join(type_dir, f"{clean_source_name}_p{page_number}_{doc_idx}.{file_ext}")
            else:
                # Flat layout: subtype is encoded in the filename instead.
                image_path = os.path.join(
                    output_directory, f"{clean_source_name}_{subtype}_p{page_number}_{doc_idx}.{file_ext}"
                )

            # Delegate the actual write to the centralized API helper.
            try:
                if save_image_to_disk(image_content, image_path, target_format):
                    image_counts[subtype] += 1
                    image_counts["total"] += 1
                    logger.debug(f"Saved {subtype} image: {image_path}")
                else:
                    logger.error(f"Failed to save {subtype} image for {clean_source_name}")
            except Exception as e:
                logger.error(f"Failed to save {subtype} image for {clean_source_name}: {e}")

        except Exception as e:
            logger.error(f"Failed to process document {doc_idx}: {e}")
            continue

    # Summarize what was written.
    if image_counts["total"] > 0:
        logger.info(f"Successfully saved {image_counts['total']} images to {output_directory}")
        for img_type, count in image_counts.items():
            if img_type != "total" and count > 0:
                logger.info(f"  - {img_type}: {count}")
    else:
        logger.info("No images were saved (none met filter criteria)")

    return image_counts
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def save_images_from_response(response: Dict[str, Any], output_directory: str, **kwargs) -> Dict[str, int]:
    """
    Save images from a full API response's "data" field.

    Parameters
    ----------
    response : Dict[str, Any]
        Full API response; document results are expected under "data".
    output_directory : str
        Directory where images will be saved.
    **kwargs
        Forwarded to :func:`save_images_to_disk` (filtering options,
        ``output_format``, etc.).

    Returns
    -------
    Dict[str, int]
        Counts of images saved by type; empty dict when the response has
        no usable "data".
    """
    documents = response.get("data")
    if not documents:
        logger.warning("No data found in response")
        return {}
    return save_images_to_disk(documents, output_directory, **kwargs)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def save_images_from_ingestor_results(
|
|
260
|
+
results: List[List[Dict[str, Any]]], output_directory: str, **kwargs
|
|
261
|
+
) -> Dict[str, int]:
|
|
262
|
+
"""
|
|
263
|
+
Save images from Ingestor.ingest() results.
|
|
264
|
+
|
|
265
|
+
Parameters
|
|
266
|
+
----------
|
|
267
|
+
results : List[List[Dict[str, Any]]]
|
|
268
|
+
Results from Ingestor.ingest(), where each inner list contains
|
|
269
|
+
document results for one source file. Can also handle LazyLoadedList
|
|
270
|
+
objects when save_to_disk=True is used.
|
|
271
|
+
output_directory : str
|
|
272
|
+
Directory where images will be saved.
|
|
273
|
+
**kwargs
|
|
274
|
+
Additional arguments passed to save_images_to_disk().
|
|
275
|
+
Includes output_format ("auto", "png", or "jpeg") and other filtering options.
|
|
276
|
+
|
|
277
|
+
Returns
|
|
278
|
+
-------
|
|
279
|
+
Dict[str, int]
|
|
280
|
+
Dictionary with counts of images saved by type.
|
|
281
|
+
"""
|
|
282
|
+
|
|
283
|
+
# Flatten results from multiple documents into single list
|
|
284
|
+
all_documents = []
|
|
285
|
+
for doc_results in results:
|
|
286
|
+
if isinstance(doc_results, list):
|
|
287
|
+
# Standard list of document results
|
|
288
|
+
all_documents.extend(doc_results)
|
|
289
|
+
elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
|
|
290
|
+
# Handle LazyLoadedList or other sequence-like objects
|
|
291
|
+
try:
|
|
292
|
+
all_documents.extend(list(doc_results))
|
|
293
|
+
except Exception as e:
|
|
294
|
+
logger.warning(f"Failed to process document results: {e}")
|
|
295
|
+
continue
|
|
296
|
+
else:
|
|
297
|
+
# Handle single document case
|
|
298
|
+
all_documents.append(doc_results)
|
|
299
|
+
|
|
300
|
+
return save_images_to_disk(all_documents, output_directory, **kwargs)
|
|
@@ -35,6 +35,7 @@ class ClientConfigSchema:
|
|
|
35
35
|
"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
|
|
36
36
|
)
|
|
37
37
|
self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
|
|
38
|
+
self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
@unified_exception_handler
|
|
@@ -17,8 +17,6 @@ import numpy as np
|
|
|
17
17
|
import pandas as pd
|
|
18
18
|
import requests
|
|
19
19
|
from minio import Minio
|
|
20
|
-
from minio.commonconfig import CopySource
|
|
21
|
-
from minio.deleteobjects import DeleteObject
|
|
22
20
|
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
|
|
23
21
|
from nv_ingest_client.util.transport import infer_microservice
|
|
24
22
|
from nv_ingest_client.util.util import ClientConfigSchema
|
|
@@ -42,10 +40,10 @@ from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
|
|
|
42
40
|
from pymilvus.orm.types import CONSISTENCY_BOUNDED
|
|
43
41
|
from scipy.sparse import csr_array
|
|
44
42
|
|
|
43
|
+
|
|
45
44
|
logger = logging.getLogger(__name__)
|
|
46
45
|
|
|
47
46
|
CONSISTENCY = CONSISTENCY_BOUNDED
|
|
48
|
-
MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
|
|
49
47
|
|
|
50
48
|
pandas_reader_map = {
|
|
51
49
|
".json": pd.read_json,
|
|
@@ -750,7 +748,7 @@ def bulk_insert_milvus(
|
|
|
750
748
|
minio_endpoint: str = "localhost:9000",
|
|
751
749
|
access_key: str = "minioadmin",
|
|
752
750
|
secret_key: str = "minioadmin",
|
|
753
|
-
bucket_name: str =
|
|
751
|
+
bucket_name: str = None,
|
|
754
752
|
username: str = None,
|
|
755
753
|
password: str = None,
|
|
756
754
|
):
|
|
@@ -774,29 +772,16 @@ def bulk_insert_milvus(
|
|
|
774
772
|
password : str, optional
|
|
775
773
|
Milvus password.
|
|
776
774
|
"""
|
|
777
|
-
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
778
|
-
|
|
779
775
|
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
780
776
|
t_bulk_start = time.time()
|
|
781
777
|
task_ids = []
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
uploaded_files.append(f)
|
|
790
|
-
except Exception as e:
|
|
791
|
-
logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
|
|
792
|
-
|
|
793
|
-
task_id = utility.do_bulk_insert(
|
|
794
|
-
collection_name=collection_name,
|
|
795
|
-
files=files,
|
|
796
|
-
consistency_level=CONSISTENCY,
|
|
797
|
-
)
|
|
798
|
-
task_ids.append(task_id)
|
|
799
|
-
# list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
|
|
778
|
+
|
|
779
|
+
task_id = utility.do_bulk_insert(
|
|
780
|
+
collection_name=collection_name,
|
|
781
|
+
files=[file for files in writer.batch_files for file in files],
|
|
782
|
+
consistency_level=CONSISTENCY,
|
|
783
|
+
)
|
|
784
|
+
|
|
800
785
|
while len(task_ids) > 0:
|
|
801
786
|
time.sleep(1)
|
|
802
787
|
for task_id in task_ids:
|
|
@@ -812,9 +797,6 @@ def bulk_insert_milvus(
|
|
|
812
797
|
logger.error(f"Failed reason: {task.failed_reason}")
|
|
813
798
|
task_ids.remove(task_id)
|
|
814
799
|
|
|
815
|
-
# Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
|
|
816
|
-
minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
|
|
817
|
-
|
|
818
800
|
t_bulk_end = time.time()
|
|
819
801
|
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
820
802
|
|
|
@@ -881,7 +863,7 @@ def create_bm25_model(
|
|
|
881
863
|
return bm25_ef
|
|
882
864
|
|
|
883
865
|
|
|
884
|
-
def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
|
|
866
|
+
def stream_insert_milvus(records, client: MilvusClient, collection_name: str, batch_size: int = 5000):
|
|
885
867
|
"""
|
|
886
868
|
This function takes the input records and creates a corpus,
|
|
887
869
|
factoring in filters (i.e. texts, charts, tables) and fits
|
|
@@ -899,12 +881,46 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str):
|
|
|
899
881
|
Milvus Collection to search against
|
|
900
882
|
"""
|
|
901
883
|
count = 0
|
|
902
|
-
for
|
|
903
|
-
client.insert(collection_name=collection_name, data=[
|
|
904
|
-
count +=
|
|
884
|
+
for idx in range(0, len(records), batch_size):
|
|
885
|
+
client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
|
|
886
|
+
count += len(records[idx : idx + batch_size])
|
|
887
|
+
client.flush(collection_name)
|
|
905
888
|
logger.info(f"streamed {count} records")
|
|
906
889
|
|
|
907
890
|
|
|
891
|
+
def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient):
|
|
892
|
+
"""
|
|
893
|
+
This function waits for the index to be built. It checks
|
|
894
|
+
the indexed_rows of the index and waits for it to be equal
|
|
895
|
+
to the number of records. This only works for streaming inserts,
|
|
896
|
+
bulk inserts are not supported by this function
|
|
897
|
+
(refer to MilvusClient.refresh_load for bulk inserts).
|
|
898
|
+
"""
|
|
899
|
+
index_names = utility.list_indexes(collection_name)
|
|
900
|
+
indexed_rows = 0
|
|
901
|
+
for index_name in index_names:
|
|
902
|
+
indexed_rows = 0
|
|
903
|
+
while indexed_rows < num_elements:
|
|
904
|
+
pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
|
|
905
|
+
for i in range(20):
|
|
906
|
+
new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
|
|
907
|
+
time.sleep(1)
|
|
908
|
+
logger.info(
|
|
909
|
+
f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
|
|
910
|
+
)
|
|
911
|
+
if new_indexed_rows == num_elements:
|
|
912
|
+
indexed_rows = new_indexed_rows
|
|
913
|
+
break
|
|
914
|
+
# check if indexed_rows is staying the same, too many times means something is wrong
|
|
915
|
+
if new_indexed_rows == indexed_rows:
|
|
916
|
+
pos_movement = -1
|
|
917
|
+
# if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
|
|
918
|
+
if pos_movement == 0:
|
|
919
|
+
raise ValueError("Rows are not getting indexed as expected")
|
|
920
|
+
indexed_rows = new_indexed_rows
|
|
921
|
+
return indexed_rows
|
|
922
|
+
|
|
923
|
+
|
|
908
924
|
def write_to_nvingest_collection(
|
|
909
925
|
records,
|
|
910
926
|
collection_name: str,
|
|
@@ -920,7 +936,7 @@ def write_to_nvingest_collection(
|
|
|
920
936
|
compute_bm25_stats: bool = True,
|
|
921
937
|
access_key: str = "minioadmin",
|
|
922
938
|
secret_key: str = "minioadmin",
|
|
923
|
-
bucket_name: str =
|
|
939
|
+
bucket_name: str = None,
|
|
924
940
|
threshold: int = 1000,
|
|
925
941
|
meta_dataframe=None,
|
|
926
942
|
meta_source_field=None,
|
|
@@ -1026,8 +1042,12 @@ def write_to_nvingest_collection(
|
|
|
1026
1042
|
client,
|
|
1027
1043
|
collection_name,
|
|
1028
1044
|
)
|
|
1045
|
+
# Make sure all rows are indexed, decided not to wrap in a timeout because we dont
|
|
1046
|
+
# know how long this should take, it is num_elements dependent.
|
|
1047
|
+
wait_for_index(collection_name, num_elements, client)
|
|
1029
1048
|
else:
|
|
1030
1049
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1050
|
+
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
1031
1051
|
if not minio_client.bucket_exists(bucket_name):
|
|
1032
1052
|
minio_client.make_bucket(bucket_name)
|
|
1033
1053
|
|
|
@@ -1618,7 +1638,7 @@ def embed_index_collection(
|
|
|
1618
1638
|
compute_bm25_stats: bool = True,
|
|
1619
1639
|
access_key: str = "minioadmin",
|
|
1620
1640
|
secret_key: str = "minioadmin",
|
|
1621
|
-
bucket_name: str =
|
|
1641
|
+
bucket_name: str = None,
|
|
1622
1642
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1623
1643
|
meta_source_field: str = None,
|
|
1624
1644
|
meta_fields: list[str] = None,
|
|
@@ -1658,7 +1678,7 @@ def embed_index_collection(
|
|
|
1658
1678
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1659
1679
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1660
1680
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1661
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1681
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1662
1682
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1663
1683
|
containing metadata. Defaults to None.
|
|
1664
1684
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -1774,7 +1794,7 @@ def reindex_collection(
|
|
|
1774
1794
|
compute_bm25_stats: bool = True,
|
|
1775
1795
|
access_key: str = "minioadmin",
|
|
1776
1796
|
secret_key: str = "minioadmin",
|
|
1777
|
-
bucket_name: str =
|
|
1797
|
+
bucket_name: str = None,
|
|
1778
1798
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1779
1799
|
meta_source_field: str = None,
|
|
1780
1800
|
meta_fields: list[str] = None,
|
|
@@ -1815,7 +1835,7 @@ def reindex_collection(
|
|
|
1815
1835
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1816
1836
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1817
1837
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1818
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1838
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1819
1839
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1820
1840
|
containing metadata. Defaults to None.
|
|
1821
1841
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -1923,7 +1943,7 @@ class Milvus(VDB):
|
|
|
1923
1943
|
compute_bm25_stats: bool = True,
|
|
1924
1944
|
access_key: str = "minioadmin",
|
|
1925
1945
|
secret_key: str = "minioadmin",
|
|
1926
|
-
bucket_name: str =
|
|
1946
|
+
bucket_name: str = None,
|
|
1927
1947
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1928
1948
|
meta_source_field: str = None,
|
|
1929
1949
|
meta_fields: list[str] = None,
|
|
@@ -1954,7 +1974,7 @@ class Milvus(VDB):
|
|
|
1954
1974
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1955
1975
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1956
1976
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1957
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1977
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1958
1978
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1959
1979
|
containing metadata. Defaults to None.
|
|
1960
1980
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -42,6 +42,8 @@ src/nv_ingest_client/primitives/tasks/udf.py
|
|
|
42
42
|
src/nv_ingest_client/primitives/tasks/vdb_upload.py
|
|
43
43
|
src/nv_ingest_client/util/__init__.py
|
|
44
44
|
src/nv_ingest_client/util/dataset.py
|
|
45
|
+
src/nv_ingest_client/util/document_analysis.py
|
|
46
|
+
src/nv_ingest_client/util/image_disk_utils.py
|
|
45
47
|
src/nv_ingest_client/util/milvus.py
|
|
46
48
|
src/nv_ingest_client/util/process_json_files.py
|
|
47
49
|
src/nv_ingest_client/util/processing.py
|
|
File without changes
|
{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/MANIFEST.in
RENAMED
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.8.dev20250908 → nv_ingest_client-2025.9.10.dev20250910}/src/version.py
RENAMED
|
File without changes
|