nv-ingest-client 2025.9.9.dev20250909__tar.gz → 2025.9.11.dev20250911__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- {nv_ingest_client-2025.9.9.dev20250909/src/nv_ingest_client.egg-info → nv_ingest_client-2025.9.11.dev20250911}/PKG-INFO +1 -1
- nv_ingest_client-2025.9.11.dev20250911/src/nv_ingest_client/util/document_analysis.py +314 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/util.py +1 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/milvus.py +20 -34
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/LICENSE +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/README.md +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/pyproject.toml +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/setup.cfg +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/client.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/interface.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/version.py +0 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Utility functions for analyzing document-level chunk composition from nv-ingest results.
|
|
7
|
+
|
|
8
|
+
This module provides analysis capabilities for understanding the distribution and types
|
|
9
|
+
of extracted content elements across individual documents. It enables customers to
|
|
10
|
+
gain visibility into their document composition for performance optimization and
|
|
11
|
+
capacity planning decisions.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from typing import Any, Dict, List, Union
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def analyze_document_chunks(
    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
) -> Dict[str, Dict[str, Dict[str, int]]]:
    """
    Count extracted elements by type for every page of every document.

    The ingestion results are grouped by document name and page, each element is
    assigned to one category, and the per-page tallies are returned together with
    a per-document ``"total"`` entry.

    Parameters
    ----------
    results : Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
        Output of ``ingestor.ingest()``, either in the nested ``List[List[Dict]]``
        form (one inner sequence per document) or as a flat ``List[Dict]``.
        Inner sequences may be plain lists or list-like objects such as
        LazyLoadedList.

    Returns
    -------
    Dict[str, Dict[str, Dict[str, int]]]
        Mapping of document name -> page key -> category counts. Page keys appear
        in the order they are first seen and the ``"total"`` key is added last:
        {
            "document1.pdf": {
                "1": {
                    "text": 3, "charts": 1, "tables": 0,
                    "unstructured_images": 0, "infographics": 0, "page_images": 0
                },
                "2": {
                    "text": 4, "charts": 0, "tables": 1,
                    "unstructured_images": 0, "infographics": 0, "page_images": 0
                },
                "total": {
                    "text": 7, "charts": 1, "tables": 1,
                    "unstructured_images": 0, "infographics": 0, "page_images": 0
                }
            },
            "document2.pdf": {...}
        }
        An empty dict is returned when ``results`` is empty.

    Notes
    -----
    - Requires purge_results_after_upload=False in vdb_upload() configuration
    - Every category is always present in each counts dict, with 0 when absent
    - Elements are expected to carry the standard nv-ingest metadata structure;
      a missing ``source_id`` or ``document_type`` raises ``KeyError``

    Examples
    --------
    >>> from nv_ingest_client.util.document_analysis import analyze_document_chunks
    >>>
    >>> results, failures = ingestor.ingest(show_progress=True, return_failures=True)
    >>> breakdown = analyze_document_chunks(results)
    >>>
    >>> for doc_name, pages in breakdown.items():
    ...     totals = pages["total"]
    ...     page_count = len(pages) - 1  # "total" is not a page
    ...     print(f"{doc_name}: {sum(totals.values())} elements across {page_count} pages")
    ...     for page_name, counts in pages.items():
    ...         if page_name != "total":
    ...             print(f"  page {page_name}: {counts['text']} text, {counts['charts']} charts")
    """

    if not results:
        logger.warning("No results provided for analysis")
        return {}

    # Phase 1: bucket every element under its (document, page) pair.
    grouped = defaultdict(lambda: defaultdict(list))
    for doc_results in _normalize_results_format(results):
        for item in _extract_elements_from_doc(doc_results):
            name = _extract_document_name(item)
            page = _extract_page_key(item)
            grouped[name][page].append(item)

    # Phase 2: tally categories per page while accumulating the document total.
    document_page_counts = {}
    for name, page_map in grouped.items():
        per_page = {}
        document_page_counts[name] = per_page
        doc_totals = _initialize_element_counts()

        for page, items in page_map.items():
            tally = _initialize_element_counts()
            for item in items:
                category = _categorize_element(item)
                tally[category] += 1
                doc_totals[category] += 1
            per_page[page] = tally

        # Inserted after all pages so "total" is always the last key.
        per_page["total"] = doc_totals

    if not document_page_counts:
        logger.warning("No valid documents found for analysis")
        return document_page_counts

    total_docs = len(document_page_counts)
    # Each document dict holds one extra "total" entry that is not a page.
    total_pages = sum(len(page_counts) - 1 for page_counts in document_page_counts.values())
    total_elements = sum(sum(page_counts["total"].values()) for page_counts in document_page_counts.values())
    logger.info(f"Analyzed {total_elements} elements across {total_pages} pages in {total_docs} documents")

    return document_page_counts
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _normalize_results_format(results: Union[List[List[Dict]], List[Dict]]) -> List[List[Dict]]:
|
|
144
|
+
"""
|
|
145
|
+
Normalize various input formats to consistent List[List[Dict]] structure.
|
|
146
|
+
|
|
147
|
+
Parameters
|
|
148
|
+
----------
|
|
149
|
+
results : Union[List[List[Dict]], List[Dict]]
|
|
150
|
+
Input results in various formats
|
|
151
|
+
|
|
152
|
+
Returns
|
|
153
|
+
-------
|
|
154
|
+
List[List[Dict]]
|
|
155
|
+
Normalized results in standard format
|
|
156
|
+
"""
|
|
157
|
+
|
|
158
|
+
if not results:
|
|
159
|
+
return []
|
|
160
|
+
|
|
161
|
+
# Handle List[List[Dict]] or List[LazyLoadedList] formats
|
|
162
|
+
if isinstance(results, list) and len(results) > 0:
|
|
163
|
+
first_elem = results[0]
|
|
164
|
+
# Check for list, LazyLoadedList, or any sequence-like object
|
|
165
|
+
if isinstance(first_elem, list) or (
|
|
166
|
+
hasattr(first_elem, "__iter__") and hasattr(first_elem, "__len__") and not isinstance(first_elem, dict)
|
|
167
|
+
):
|
|
168
|
+
return results
|
|
169
|
+
|
|
170
|
+
# Handle flattened List[Dict] format by grouping elements by document
|
|
171
|
+
if isinstance(results, list) and len(results) > 0 and isinstance(results[0], dict):
|
|
172
|
+
doc_groups = defaultdict(list)
|
|
173
|
+
for element in results:
|
|
174
|
+
doc_name = _extract_document_name(element)
|
|
175
|
+
doc_groups[doc_name].append(element)
|
|
176
|
+
|
|
177
|
+
return list(doc_groups.values())
|
|
178
|
+
|
|
179
|
+
# Fallback for unexpected formats
|
|
180
|
+
return [[item] for item in results if item]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _extract_elements_from_doc(doc_results) -> List[Dict]:
|
|
184
|
+
"""
|
|
185
|
+
Extract elements from document results, handling various data types.
|
|
186
|
+
|
|
187
|
+
Parameters
|
|
188
|
+
----------
|
|
189
|
+
doc_results : Any
|
|
190
|
+
Document results which may be a list, LazyLoadedList, or other iterable
|
|
191
|
+
|
|
192
|
+
Returns
|
|
193
|
+
-------
|
|
194
|
+
List[Dict]
|
|
195
|
+
List of element dictionaries
|
|
196
|
+
"""
|
|
197
|
+
|
|
198
|
+
if isinstance(doc_results, list):
|
|
199
|
+
return doc_results
|
|
200
|
+
elif hasattr(doc_results, "__iter__") and hasattr(doc_results, "__len__"):
|
|
201
|
+
# Handle LazyLoadedList and other sequence-like objects
|
|
202
|
+
return list(doc_results)
|
|
203
|
+
else:
|
|
204
|
+
# Single element case
|
|
205
|
+
return [doc_results] if doc_results else []
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _extract_document_name(element: Dict[str, Any]) -> str:
|
|
209
|
+
"""
|
|
210
|
+
Extract clean document name from element metadata.
|
|
211
|
+
|
|
212
|
+
Parameters
|
|
213
|
+
----------
|
|
214
|
+
element : Dict[str, Any]
|
|
215
|
+
Element dictionary containing metadata
|
|
216
|
+
|
|
217
|
+
Returns
|
|
218
|
+
-------
|
|
219
|
+
str
|
|
220
|
+
Clean document filename (basename of source_id)
|
|
221
|
+
"""
|
|
222
|
+
|
|
223
|
+
# nv-ingest guarantees this structure exists
|
|
224
|
+
source_id = element["metadata"]["source_metadata"]["source_id"]
|
|
225
|
+
return os.path.basename(source_id)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _extract_page_key(element: Dict[str, Any]) -> str:
|
|
229
|
+
"""
|
|
230
|
+
Extract page key from element metadata for consistent page naming.
|
|
231
|
+
|
|
232
|
+
Parameters
|
|
233
|
+
----------
|
|
234
|
+
element : Dict[str, Any]
|
|
235
|
+
Element dictionary containing metadata
|
|
236
|
+
|
|
237
|
+
Returns
|
|
238
|
+
-------
|
|
239
|
+
str
|
|
240
|
+
Page number as string (e.g., "1", "2", or "unknown")
|
|
241
|
+
"""
|
|
242
|
+
|
|
243
|
+
try:
|
|
244
|
+
page_number = element["metadata"]["content_metadata"]["page_number"]
|
|
245
|
+
if page_number is not None and page_number >= 0:
|
|
246
|
+
return str(page_number)
|
|
247
|
+
else:
|
|
248
|
+
return "unknown"
|
|
249
|
+
except (KeyError, TypeError):
|
|
250
|
+
logger.warning("Missing or invalid page_number in element metadata")
|
|
251
|
+
return "unknown"
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _categorize_element(element: Dict[str, Any]) -> str:
|
|
255
|
+
"""
|
|
256
|
+
Categorize element by type using document_type and content metadata.
|
|
257
|
+
|
|
258
|
+
Parameters
|
|
259
|
+
----------
|
|
260
|
+
element : Dict[str, Any]
|
|
261
|
+
Element dictionary with document_type and metadata fields
|
|
262
|
+
|
|
263
|
+
Returns
|
|
264
|
+
-------
|
|
265
|
+
str
|
|
266
|
+
Element category: "text", "charts", "tables", "unstructured_images",
|
|
267
|
+
"infographics", or "page_images"
|
|
268
|
+
"""
|
|
269
|
+
|
|
270
|
+
doc_type = element["document_type"]
|
|
271
|
+
|
|
272
|
+
# Text elements
|
|
273
|
+
if doc_type == "text":
|
|
274
|
+
return "text"
|
|
275
|
+
|
|
276
|
+
# Structured elements with subtypes
|
|
277
|
+
elif doc_type == "structured":
|
|
278
|
+
subtype = element["metadata"]["content_metadata"]["subtype"]
|
|
279
|
+
if subtype == "chart":
|
|
280
|
+
return "charts"
|
|
281
|
+
elif subtype == "table":
|
|
282
|
+
return "tables"
|
|
283
|
+
elif subtype == "infographic":
|
|
284
|
+
return "infographics"
|
|
285
|
+
elif subtype == "page_image":
|
|
286
|
+
return "page_images"
|
|
287
|
+
|
|
288
|
+
# Image elements (unstructured)
|
|
289
|
+
elif doc_type == "image":
|
|
290
|
+
return "unstructured_images"
|
|
291
|
+
|
|
292
|
+
# Should not reach here with valid nv-ingest output
|
|
293
|
+
logger.warning(f"Unexpected element type: {doc_type}")
|
|
294
|
+
return "text" # Default to text for safety
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _initialize_element_counts() -> Dict[str, int]:
|
|
298
|
+
"""
|
|
299
|
+
Initialize element counts dictionary with all supported types.
|
|
300
|
+
|
|
301
|
+
Returns
|
|
302
|
+
-------
|
|
303
|
+
Dict[str, int]
|
|
304
|
+
Dictionary with zero counts for all element types
|
|
305
|
+
"""
|
|
306
|
+
|
|
307
|
+
return {
|
|
308
|
+
"text": 0,
|
|
309
|
+
"charts": 0,
|
|
310
|
+
"tables": 0,
|
|
311
|
+
"unstructured_images": 0,
|
|
312
|
+
"infographics": 0,
|
|
313
|
+
"page_images": 0,
|
|
314
|
+
}
|
|
@@ -35,6 +35,7 @@ class ClientConfigSchema:
|
|
|
35
35
|
"https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
|
|
36
36
|
)
|
|
37
37
|
self.nv_ranker_nim_model_name: str = os.getenv("RERANKER_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
|
|
38
|
+
self.minio_bucket_name: str = os.getenv("MINIO_BUCKET", "nv-ingest")
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
@unified_exception_handler
|
|
@@ -17,8 +17,6 @@ import numpy as np
|
|
|
17
17
|
import pandas as pd
|
|
18
18
|
import requests
|
|
19
19
|
from minio import Minio
|
|
20
|
-
from minio.commonconfig import CopySource
|
|
21
|
-
from minio.deleteobjects import DeleteObject
|
|
22
20
|
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
|
|
23
21
|
from nv_ingest_client.util.transport import infer_microservice
|
|
24
22
|
from nv_ingest_client.util.util import ClientConfigSchema
|
|
@@ -46,7 +44,6 @@ from scipy.sparse import csr_array
|
|
|
46
44
|
logger = logging.getLogger(__name__)
|
|
47
45
|
|
|
48
46
|
CONSISTENCY = CONSISTENCY_BOUNDED
|
|
49
|
-
MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
|
|
50
47
|
|
|
51
48
|
pandas_reader_map = {
|
|
52
49
|
".json": pd.read_json,
|
|
@@ -751,7 +748,7 @@ def bulk_insert_milvus(
|
|
|
751
748
|
minio_endpoint: str = "localhost:9000",
|
|
752
749
|
access_key: str = "minioadmin",
|
|
753
750
|
secret_key: str = "minioadmin",
|
|
754
|
-
bucket_name: str =
|
|
751
|
+
bucket_name: str = None,
|
|
755
752
|
username: str = None,
|
|
756
753
|
password: str = None,
|
|
757
754
|
):
|
|
@@ -775,29 +772,16 @@ def bulk_insert_milvus(
|
|
|
775
772
|
password : str, optional
|
|
776
773
|
Milvus password.
|
|
777
774
|
"""
|
|
778
|
-
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
779
|
-
|
|
780
775
|
connections.connect(uri=milvus_uri, token=f"{username}:{password}")
|
|
781
776
|
t_bulk_start = time.time()
|
|
782
777
|
task_ids = []
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
uploaded_files.append(f)
|
|
791
|
-
except Exception as e:
|
|
792
|
-
logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
|
|
793
|
-
|
|
794
|
-
task_id = utility.do_bulk_insert(
|
|
795
|
-
collection_name=collection_name,
|
|
796
|
-
files=files,
|
|
797
|
-
consistency_level=CONSISTENCY,
|
|
798
|
-
)
|
|
799
|
-
task_ids.append(task_id)
|
|
800
|
-
# list_bulk_insert_tasks = utility.list_bulk_insert_tasks(collection_name=collection_name)
|
|
778
|
+
|
|
779
|
+
task_id = utility.do_bulk_insert(
|
|
780
|
+
collection_name=collection_name,
|
|
781
|
+
files=[file for files in writer.batch_files for file in files],
|
|
782
|
+
consistency_level=CONSISTENCY,
|
|
783
|
+
)
|
|
784
|
+
|
|
801
785
|
while len(task_ids) > 0:
|
|
802
786
|
time.sleep(1)
|
|
803
787
|
for task_id in task_ids:
|
|
@@ -813,9 +797,6 @@ def bulk_insert_milvus(
|
|
|
813
797
|
logger.error(f"Failed reason: {task.failed_reason}")
|
|
814
798
|
task_ids.remove(task_id)
|
|
815
799
|
|
|
816
|
-
# Cleanup: remove the copied files to undo the temporary workaround before bulk insert.
|
|
817
|
-
minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
|
|
818
|
-
|
|
819
800
|
t_bulk_end = time.time()
|
|
820
801
|
logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
|
|
821
802
|
|
|
@@ -903,6 +884,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
|
|
|
903
884
|
for idx in range(0, len(records), batch_size):
|
|
904
885
|
client.insert(collection_name=collection_name, data=records[idx : idx + batch_size])
|
|
905
886
|
count += len(records[idx : idx + batch_size])
|
|
887
|
+
client.flush(collection_name)
|
|
906
888
|
logger.info(f"streamed {count} records")
|
|
907
889
|
|
|
908
890
|
|
|
@@ -923,6 +905,9 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
|
|
|
923
905
|
for i in range(20):
|
|
924
906
|
new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
|
|
925
907
|
time.sleep(1)
|
|
908
|
+
logger.info(
|
|
909
|
+
f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
|
|
910
|
+
)
|
|
926
911
|
if new_indexed_rows == num_elements:
|
|
927
912
|
indexed_rows = new_indexed_rows
|
|
928
913
|
break
|
|
@@ -951,7 +936,7 @@ def write_to_nvingest_collection(
|
|
|
951
936
|
compute_bm25_stats: bool = True,
|
|
952
937
|
access_key: str = "minioadmin",
|
|
953
938
|
secret_key: str = "minioadmin",
|
|
954
|
-
bucket_name: str =
|
|
939
|
+
bucket_name: str = None,
|
|
955
940
|
threshold: int = 1000,
|
|
956
941
|
meta_dataframe=None,
|
|
957
942
|
meta_source_field=None,
|
|
@@ -1062,6 +1047,7 @@ def write_to_nvingest_collection(
|
|
|
1062
1047
|
wait_for_index(collection_name, num_elements, client)
|
|
1063
1048
|
else:
|
|
1064
1049
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1050
|
+
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
1065
1051
|
if not minio_client.bucket_exists(bucket_name):
|
|
1066
1052
|
minio_client.make_bucket(bucket_name)
|
|
1067
1053
|
|
|
@@ -1652,7 +1638,7 @@ def embed_index_collection(
|
|
|
1652
1638
|
compute_bm25_stats: bool = True,
|
|
1653
1639
|
access_key: str = "minioadmin",
|
|
1654
1640
|
secret_key: str = "minioadmin",
|
|
1655
|
-
bucket_name: str =
|
|
1641
|
+
bucket_name: str = None,
|
|
1656
1642
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1657
1643
|
meta_source_field: str = None,
|
|
1658
1644
|
meta_fields: list[str] = None,
|
|
@@ -1692,7 +1678,7 @@ def embed_index_collection(
|
|
|
1692
1678
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1693
1679
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1694
1680
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1695
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1681
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1696
1682
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1697
1683
|
containing metadata. Defaults to None.
|
|
1698
1684
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -1808,7 +1794,7 @@ def reindex_collection(
|
|
|
1808
1794
|
compute_bm25_stats: bool = True,
|
|
1809
1795
|
access_key: str = "minioadmin",
|
|
1810
1796
|
secret_key: str = "minioadmin",
|
|
1811
|
-
bucket_name: str =
|
|
1797
|
+
bucket_name: str = None,
|
|
1812
1798
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1813
1799
|
meta_source_field: str = None,
|
|
1814
1800
|
meta_fields: list[str] = None,
|
|
@@ -1849,7 +1835,7 @@ def reindex_collection(
|
|
|
1849
1835
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1850
1836
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1851
1837
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1852
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1838
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1853
1839
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1854
1840
|
containing metadata. Defaults to None.
|
|
1855
1841
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -1957,7 +1943,7 @@ class Milvus(VDB):
|
|
|
1957
1943
|
compute_bm25_stats: bool = True,
|
|
1958
1944
|
access_key: str = "minioadmin",
|
|
1959
1945
|
secret_key: str = "minioadmin",
|
|
1960
|
-
bucket_name: str =
|
|
1946
|
+
bucket_name: str = None,
|
|
1961
1947
|
meta_dataframe: Union[str, pd.DataFrame] = None,
|
|
1962
1948
|
meta_source_field: str = None,
|
|
1963
1949
|
meta_fields: list[str] = None,
|
|
@@ -1988,7 +1974,7 @@ class Milvus(VDB):
|
|
|
1988
1974
|
compute_bm25_stats (bool, optional): Whether to compute BM25 statistics. Defaults to True.
|
|
1989
1975
|
access_key (str, optional): The access key for MinIO authentication. Defaults to "minioadmin".
|
|
1990
1976
|
secret_key (str, optional): The secret key for MinIO authentication. Defaults to "minioadmin".
|
|
1991
|
-
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1977
|
+
bucket_name (str, optional): The name of the MinIO bucket.
|
|
1992
1978
|
meta_dataframe (Union[str, pd.DataFrame], optional): A metadata DataFrame or the path to a CSV file
|
|
1993
1979
|
containing metadata. Defaults to None.
|
|
1994
1980
|
meta_source_field (str, optional): The field in the metadata that serves as the source identifier.
|
|
@@ -42,6 +42,7 @@ src/nv_ingest_client/primitives/tasks/udf.py
|
|
|
42
42
|
src/nv_ingest_client/primitives/tasks/vdb_upload.py
|
|
43
43
|
src/nv_ingest_client/util/__init__.py
|
|
44
44
|
src/nv_ingest_client/util/dataset.py
|
|
45
|
+
src/nv_ingest_client/util/document_analysis.py
|
|
45
46
|
src/nv_ingest_client/util/image_disk_utils.py
|
|
46
47
|
src/nv_ingest_client/util/milvus.py
|
|
47
48
|
src/nv_ingest_client/util/process_json_files.py
|
|
File without changes
|
{nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/MANIFEST.in
RENAMED
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.9.dev20250909 → nv_ingest_client-2025.9.11.dev20250911}/src/version.py
RENAMED
|
File without changes
|