nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.12.17.dev20251217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/client.py +112 -2
- nv_ingest_client/client/interface.py +301 -83
- nv_ingest_client/nv_ingest_cli.py +2 -2
- nv_ingest_client/primitives/jobs/job_spec.py +26 -1
- nv_ingest_client/primitives/tasks/caption.py +12 -1
- nv_ingest_client/primitives/tasks/extract.py +50 -2
- nv_ingest_client/primitives/tasks/store.py +18 -13
- nv_ingest_client/util/file_processing/extract.py +23 -0
- nv_ingest_client/util/util.py +34 -1
- nv_ingest_client/util/vdb/adt_vdb.py +216 -0
- nv_ingest_client/util/vdb/lancedb.py +276 -0
- nv_ingest_client/util/vdb/milvus.py +44 -21
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA +2 -1
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD +18 -17
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from nv_ingest_client.util.vdb.adt_vdb import VDB
|
|
5
|
+
from datetime import timedelta
|
|
6
|
+
from functools import partial
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from nv_ingest_client.util.transport import infer_microservice
|
|
9
|
+
import lancedb
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_lancedb_results(results):
    """Transform NV-Ingest pipeline results into LanceDB ingestible rows.

    The NV-Ingest pipeline provides nested lists of record dictionaries. This
    helper extracts the inner `metadata` dict for each record, filters out
    entries without an embedding, and returns a list of dictionaries with the
    exact fields expected by the LanceDB table schema used in
    `LanceDB.create_index`.

    Parameters
    ----------
    results : list or None
        Nested list-of-lists containing record dicts in the NV-Ingest format.
        ``None`` or an empty list yields an empty result; empty (or ``None``)
        inner lists are skipped.

    Returns
    -------
    list
        List of dictionaries with keys: `vector` (embedding list), `text`
        (string content), `metadata` (page number) and `source` (source id).

    Raises
    ------
    KeyError
        If a record lacks `metadata`, or its metadata lacks `embedding`,
        `content`, `content_metadata.page_number`, or
        `source_metadata.source_id`.

    Notes
    -----
    - Records with `embedding is None` are skipped.
    - The page number is passed through unchanged (no string conversion is
      performed here).
    """
    # `LanceDB.create_index` documents `records=None` as valid input, so an
    # absent result set must produce "no rows" rather than a TypeError.
    if not results:
        return []

    rows = []
    for batch in results:
        for record in batch or ():
            metadata = record["metadata"]
            # Rows without an embedding cannot be stored in the vector column.
            if metadata["embedding"] is None:
                continue
            rows.append(
                {
                    "vector": metadata["embedding"],
                    "text": metadata["content"],
                    "metadata": metadata["content_metadata"]["page_number"],
                    "source": metadata["source_metadata"]["source_id"],
                }
            )
    return rows
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class LanceDB(VDB):
    """LanceDB operator implementing the VDB interface.

    This class adapts NV-Ingest records to LanceDB, providing index creation,
    ingestion, and retrieval hooks. The implementation is intentionally small
    and focuses on the example configuration used in NV-Ingest evaluation
    scripts.
    """

    def __init__(
        self,
        uri=None,
        overwrite=True,
        table_name="nv-ingest",
        index_type="IVF_HNSW_SQ",
        metric="l2",
        num_partitions=16,
        num_sub_vectors=256,
        **kwargs
    ):
        """Initialize the LanceDB VDB operator.

        Parameters
        ----------
        uri : str, optional
            LanceDB connection URI. ``None`` (or any falsy value) falls back
            to "lancedb".
        overwrite : bool, optional
            If True, existing tables will be overwritten during index creation.
            If False, new data will be appended to existing tables.
        table_name : str, optional
            Name of the LanceDB table to create/use (default is "nv-ingest").
        index_type : str, optional
            Type of vector index to create (default is "IVF_HNSW_SQ").
        metric : str, optional
            Distance metric for the vector index (default is "l2").
        num_partitions : int, optional
            Number of partitions for the vector index (default is 16).
        num_sub_vectors : int, optional
            Number of sub-vectors for the vector index (default is 256).
        **kwargs : dict
            Any additional options; they are passed unchanged to
            ``VDB.__init__`` and are not consumed by this class.
        """
        self.uri = uri or "lancedb"
        self.overwrite = overwrite
        self.table_name = table_name
        self.index_type = index_type
        self.metric = metric
        self.num_partitions = num_partitions
        self.num_sub_vectors = num_sub_vectors
        super().__init__(**kwargs)

    def _resolve_table(self, table):
        """Return `table`, or open `self.table_name` from `self.uri` when it is None."""
        if table is not None:
            return table
        return lancedb.connect(uri=self.uri).open_table(self.table_name)

    def create_index(self, records=None, table_name="nv-ingest", **kwargs):
        """Create a LanceDB table and populate it with transformed records.

        This method connects to LanceDB, transforms NV-Ingest records using
        `create_lancedb_results`, builds a PyArrow schema that matches the
        expected table layout, and creates, overwrites, or appends to the
        table named `table_name`.

        Parameters
        ----------
        records : list, optional
            NV-Ingest records in nested list format (the same structure passed
            to `run`). If ``None``, an empty table will be created.

        table_name : str, optional
            Name of the LanceDB table to create (default is "nv-ingest").
            Note that this default is independent of ``self.table_name``;
            `run` passes ``self.table_name`` explicitly.

        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        table
            The LanceDB table object returned by `db.create_table`.
        """
        db = lancedb.connect(uri=self.uri)
        results = create_lancedb_results(records) if records is not None else []

        # Size the fixed-width vector column from the data when there is any;
        # 2048 remains the fallback for an empty table.
        vector_dim = len(results[0]["vector"]) if results else 2048
        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), vector_dim)),
                pa.field("text", pa.string()),
                pa.field("metadata", pa.string()),
                pa.field("source", pa.string()),
            ]
        )

        if self.overwrite:
            # With no rows, pass data=None so the table is created from the
            # schema alone.
            return db.create_table(table_name, data=results or None, schema=schema, mode="overwrite")

        # `create_table` only supports the "create" and "overwrite" modes, so
        # appending means: open the table (creating it when missing) and add
        # the new rows afterwards.
        table = db.create_table(table_name, schema=schema, mode="create", exist_ok=True)
        if results:
            table.add(results)
        return table

    def write_to_index(
        self,
        records,
        table=None,
        index_type="IVF_HNSW_SQ",
        metric="l2",
        num_partitions=16,
        num_sub_vectors=256,
        **kwargs
    ):
        """Create an index on the LanceDB table and wait for it to become ready.

        This function calls `table.create_index` on the `vector` column with
        the given index configuration. After requesting index construction it
        lists available indices and waits for each one to reach a ready state
        using `table.wait_for_index` (up to 600 seconds per index).

        Parameters
        ----------
        records : list
            The original records being indexed (not used directly in this
            implementation but kept in the signature for consistency).
        table : object, optional
            LanceDB table object returned by `create_index`. If ``None``, the
            table named ``self.table_name`` is opened from ``self.uri``.
        index_type : str, optional
            Type of vector index to create (default is "IVF_HNSW_SQ").
        metric : str, optional
            Distance metric for the vector index (default is "l2").
        num_partitions : int, optional
            Number of partitions for the vector index (default is 16).
        num_sub_vectors : int, optional
            Number of sub-vectors for the vector index (default is 256).
        **kwargs : dict
            Accepted for interface compatibility; not used.
        """
        table = self._resolve_table(table)
        # NOTE: accelerator="cuda" is left disabled for this index build.
        table.create_index(
            index_type=index_type,
            metric=metric,
            num_partitions=num_partitions,
            num_sub_vectors=num_sub_vectors,
            vector_column_name="vector",
        )
        timeout = timedelta(seconds=600)
        for index_stub in table.list_indices():
            table.wait_for_index([index_stub.name], timeout=timeout)

    def retrieval(
        self,
        queries,
        table=None,
        embedding_endpoint="http://localhost:8012/v1",
        nvidia_api_key=None,
        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
        result_fields=["text", "metadata", "source"],
        top_k=10,
        **kwargs
    ):
        """Run similarity search for a list of text queries.

        This method converts textual queries to embeddings by calling the
        transport helper `infer_microservice` and performs a vector search
        against the `vector` column of the LanceDB `table`.

        Parameters
        ----------
        queries : list[str]
            Text queries to be embedded and searched.
        table : object, optional
            LanceDB table object with a built vector index. If ``None``, the
            table named ``self.table_name`` is opened from ``self.uri``.
        embedding_endpoint : str, optional
            URL of the embedding microservice (default is
            "http://localhost:8012/v1"). gRPC is used unless the URL scheme
            contains "http".
        nvidia_api_key : str, optional
            NVIDIA API key passed to the embedding helper (default is
            ``None``).
        model_name : str, optional
            Name of the embedding model to use (default is
            "nvidia/llama-3.2-nv-embedqa-1b-v2").
        result_fields : list, optional
            List of field names to retrieve from each hit document (default is
            `["text", "metadata", "source"]`).
        top_k : int, optional
            Number of top results to return per query (default is 10).
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        list[list[dict]]
            For each input query, the list returned by the search (at most
            `top_k` hits, restricted to `result_fields`).
        """
        table = self._resolve_table(table)
        # Copy so the shared default list can never be mutated downstream.
        fields = list(result_fields)
        embed_model = partial(
            infer_microservice,
            model_name=model_name,
            embedding_endpoint=embedding_endpoint,
            nvidia_api_key=nvidia_api_key,
            input_type="query",
            output_names=["embeddings"],
            grpc="http" not in urlparse(embedding_endpoint).scheme,
        )
        query_embeddings = embed_model(queries)
        return [
            table.search([query_embed], vector_column_name="vector").select(fields).limit(top_k).to_list()
            for query_embed in query_embeddings
        ]

    def run(self, records):
        """Orchestrate index creation and data ingestion.

        The `run` method first creates or updates the table by calling
        `create_index` with ``self.table_name`` and then builds the vector
        index with `write_to_index`, using the index settings stored on the
        instance.

        Parameters
        ----------
        records : list
            NV-Ingest records to index.

        Returns
        -------
        list
            The original `records` list is returned unchanged to make the
            operator composable in pipelines.
        """
        table = self.create_index(records=records, table_name=self.table_name)
        self.write_to_index(
            records,
            table=table,
            index_type=self.index_type,
            metric=self.metric,
            num_partitions=self.num_partitions,
            num_sub_vectors=self.num_sub_vectors,
        )
        return records
|
|
@@ -287,6 +287,10 @@ def create_nvingest_index_params(
|
|
|
287
287
|
gpu_index: bool = True,
|
|
288
288
|
gpu_search: bool = False,
|
|
289
289
|
local_index: bool = True,
|
|
290
|
+
intermediate_graph_degree: int = 128,
|
|
291
|
+
graph_degree: int = 100,
|
|
292
|
+
m: int = 64,
|
|
293
|
+
ef_construction: int = 512,
|
|
290
294
|
) -> IndexParams:
|
|
291
295
|
"""
|
|
292
296
|
Creates index params necessary to create an index for a collection. At a minimum,
|
|
@@ -326,8 +330,8 @@ def create_nvingest_index_params(
|
|
|
326
330
|
index_type="GPU_CAGRA",
|
|
327
331
|
metric_type="L2",
|
|
328
332
|
params={
|
|
329
|
-
"intermediate_graph_degree":
|
|
330
|
-
"graph_degree":
|
|
333
|
+
"intermediate_graph_degree": intermediate_graph_degree,
|
|
334
|
+
"graph_degree": graph_degree,
|
|
331
335
|
"build_algo": "NN_DESCENT",
|
|
332
336
|
"cache_dataset_on_device": "true",
|
|
333
337
|
"adapt_for_cpu": "false" if gpu_search else "true",
|
|
@@ -339,7 +343,7 @@ def create_nvingest_index_params(
|
|
|
339
343
|
index_name=DENSE_INDEX_NAME,
|
|
340
344
|
index_type="HNSW",
|
|
341
345
|
metric_type="L2",
|
|
342
|
-
params={"M":
|
|
346
|
+
params={"M": m, "efConstruction": ef_construction},
|
|
343
347
|
)
|
|
344
348
|
if sparse and local_index:
|
|
345
349
|
index_params.add_index(
|
|
@@ -407,6 +411,10 @@ def create_nvingest_collection(
|
|
|
407
411
|
recreate_meta: bool = False,
|
|
408
412
|
username: str = None,
|
|
409
413
|
password: str = None,
|
|
414
|
+
intermediate_graph_degree: int = 128,
|
|
415
|
+
graph_degree: int = 100,
|
|
416
|
+
m: int = 64,
|
|
417
|
+
ef_construction: int = 512,
|
|
410
418
|
) -> CollectionSchema:
|
|
411
419
|
"""
|
|
412
420
|
Creates a milvus collection with an nv-ingest compatible schema under
|
|
@@ -457,6 +465,10 @@ def create_nvingest_collection(
|
|
|
457
465
|
gpu_index=gpu_index,
|
|
458
466
|
gpu_search=gpu_search,
|
|
459
467
|
local_index=local_index,
|
|
468
|
+
intermediate_graph_degree=intermediate_graph_degree,
|
|
469
|
+
graph_degree=graph_degree,
|
|
470
|
+
m=m,
|
|
471
|
+
ef_construction=ef_construction,
|
|
460
472
|
)
|
|
461
473
|
create_collection(client, collection_name, schema, index_params, recreate=recreate)
|
|
462
474
|
d_idx, s_idx = _get_index_types(index_params, sparse=sparse)
|
|
@@ -892,7 +904,7 @@ def stream_insert_milvus(records, client: MilvusClient, collection_name: str, ba
|
|
|
892
904
|
logger.info(f"streamed {count} records")
|
|
893
905
|
|
|
894
906
|
|
|
895
|
-
def wait_for_index(collection_name: str,
|
|
907
|
+
def wait_for_index(collection_name: str, expected_rows_dict: dict, client: MilvusClient):
|
|
896
908
|
"""
|
|
897
909
|
This function waits for the index to be built. It checks
|
|
898
910
|
the indexed_rows of the index and waits for it to be equal
|
|
@@ -901,32 +913,28 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
|
|
|
901
913
|
(refer to MilvusClient.refresh_load for bulk inserts).
|
|
902
914
|
"""
|
|
903
915
|
client.flush(collection_name)
|
|
904
|
-
# index_names = utility.list_indexes(collection_name)
|
|
905
916
|
indexed_rows = 0
|
|
906
917
|
# observe dense_index, all indexes get populated simultaneously
|
|
907
|
-
for index_name in
|
|
908
|
-
indexed_rows =
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
pos_movement =
|
|
918
|
+
for index_name, rows_expected in expected_rows_dict.items():
|
|
919
|
+
indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
|
|
920
|
+
while indexed_rows < rows_expected:
|
|
921
|
+
# 0.5% of rows expected allowed without noticing an increase in indexed_rows
|
|
922
|
+
pos_movement = start_pos_movement = max((rows_expected - indexed_rows) * 0.005, 10)
|
|
912
923
|
for i in range(20):
|
|
913
|
-
|
|
924
|
+
prev_indexed_rows = indexed_rows
|
|
925
|
+
indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
|
|
914
926
|
time.sleep(1)
|
|
915
|
-
logger.info(
|
|
916
|
-
|
|
917
|
-
)
|
|
918
|
-
if current_indexed_rows == expected_rows:
|
|
919
|
-
indexed_rows = current_indexed_rows
|
|
927
|
+
logger.info(f"Indexed rows, {collection_name}, {index_name} - {indexed_rows} / {rows_expected}")
|
|
928
|
+
if indexed_rows == rows_expected:
|
|
920
929
|
break
|
|
921
930
|
# check if indexed_rows is staying the same, too many times means something is wrong
|
|
922
|
-
if
|
|
931
|
+
if indexed_rows == prev_indexed_rows:
|
|
923
932
|
pos_movement -= 1
|
|
924
933
|
else:
|
|
925
|
-
pos_movement =
|
|
934
|
+
pos_movement = start_pos_movement
|
|
926
935
|
# if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
|
|
927
936
|
if pos_movement == 0:
|
|
928
937
|
raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
|
|
929
|
-
indexed_rows = current_indexed_rows
|
|
930
938
|
return indexed_rows
|
|
931
939
|
|
|
932
940
|
|
|
@@ -953,6 +961,7 @@ def write_to_nvingest_collection(
|
|
|
953
961
|
stream: bool = False,
|
|
954
962
|
username: str = None,
|
|
955
963
|
password: str = None,
|
|
964
|
+
no_wait_index: bool = False,
|
|
956
965
|
**kwargs,
|
|
957
966
|
):
|
|
958
967
|
"""
|
|
@@ -1046,15 +1055,22 @@ def write_to_nvingest_collection(
|
|
|
1046
1055
|
if num_elements < threshold:
|
|
1047
1056
|
stream = True
|
|
1048
1057
|
if stream:
|
|
1058
|
+
# must be accessed/saved before adding new records
|
|
1059
|
+
index_names = utility.list_indexes(collection_name)
|
|
1060
|
+
expected_rows = {}
|
|
1061
|
+
for index_name in index_names:
|
|
1062
|
+
expected_rows[index_name] = (
|
|
1063
|
+
int(client.describe_index(collection_name, index_name)["indexed_rows"]) + num_elements
|
|
1064
|
+
)
|
|
1049
1065
|
stream_insert_milvus(
|
|
1050
1066
|
cleaned_records,
|
|
1051
1067
|
client,
|
|
1052
1068
|
collection_name,
|
|
1053
1069
|
)
|
|
1054
|
-
if not local_index:
|
|
1070
|
+
if not local_index and not no_wait_index:
|
|
1055
1071
|
# Make sure all rows are indexed, decided not to wrap in a timeout because we don't
|
|
1056
1072
|
# know how long this should take, it is num_elements dependent.
|
|
1057
|
-
wait_for_index(collection_name,
|
|
1073
|
+
wait_for_index(collection_name, expected_rows, client)
|
|
1058
1074
|
else:
|
|
1059
1075
|
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
|
|
1060
1076
|
bucket_name = bucket_name if bucket_name else ClientConfigSchema().minio_bucket_name
|
|
@@ -1968,6 +1984,7 @@ class Milvus(VDB):
|
|
|
1968
1984
|
threshold: int = 1000,
|
|
1969
1985
|
username: str = None,
|
|
1970
1986
|
password: str = None,
|
|
1987
|
+
no_wait_index: bool = False,
|
|
1971
1988
|
**kwargs,
|
|
1972
1989
|
):
|
|
1973
1990
|
"""
|
|
@@ -2005,6 +2022,12 @@ class Milvus(VDB):
|
|
|
2005
2022
|
"""
|
|
2006
2023
|
kwargs = locals().copy()
|
|
2007
2024
|
kwargs.pop("self", None)
|
|
2025
|
+
bucket_name = kwargs.get("bucket_name", None)
|
|
2026
|
+
if bucket_name is not None and bucket_name != ClientConfigSchema().minio_bucket_name:
|
|
2027
|
+
raise ValueError(
|
|
2028
|
+
"You must use the environment variable MINIO_BUCKET to specify bucket_name, detected:",
|
|
2029
|
+
f"`bucket_name`: {bucket_name} and MINIO_BUCKET: {ClientConfigSchema().minio_bucket_name}",
|
|
2030
|
+
)
|
|
2008
2031
|
super().__init__(**kwargs)
|
|
2009
2032
|
|
|
2010
2033
|
def create_index(self, **kwargs):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest-client
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.12.17.dev20251217
|
|
4
4
|
Summary: Python client for the nv-ingest service
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -223,6 +223,7 @@ Requires-Dist: pydantic-settings>2.0.0
|
|
|
223
223
|
Requires-Dist: requests>=2.28.2
|
|
224
224
|
Requires-Dist: setuptools>=78.1.1
|
|
225
225
|
Requires-Dist: tqdm>=4.67.1
|
|
226
|
+
Requires-Dist: lancedb>=0.25.3
|
|
226
227
|
Provides-Extra: milvus
|
|
227
228
|
Requires-Dist: pymilvus==2.5.10; extra == "milvus"
|
|
228
229
|
Requires-Dist: pymilvus[bulk_writer,model]; extra == "milvus"
|
|
@@ -1,31 +1,31 @@
|
|
|
1
1
|
nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
2
|
-
nv_ingest_client/nv_ingest_cli.py,sha256=
|
|
2
|
+
nv_ingest_client/nv_ingest_cli.py,sha256=qeZJZq_ltnNFiytQNwMY3VAL7nBUXW2HnwMzBGaKQJ0,14452
|
|
3
3
|
nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
4
4
|
nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
5
5
|
nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
|
|
6
6
|
nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
|
|
7
7
|
nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
|
|
8
8
|
nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
|
|
9
|
-
nv_ingest_client/client/client.py,sha256=
|
|
9
|
+
nv_ingest_client/client/client.py,sha256=Mb5V3nQRg_jzr07-jmK5jwgx3_WmzaGmGXrEKfoyjHU,82103
|
|
10
10
|
nv_ingest_client/client/ingest_job_handler.py,sha256=4exvMwXbzwC-tb0dWleXE-AwhJkvxvhkf_u_1bJt30U,18387
|
|
11
|
-
nv_ingest_client/client/interface.py,sha256=
|
|
11
|
+
nv_ingest_client/client/interface.py,sha256=1gmFQ7bVQDiEweChN_Divv1Y87a4cNkEgH2Shp4tIMw,64915
|
|
12
12
|
nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
|
|
13
13
|
nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
|
|
14
14
|
nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
|
|
15
|
-
nv_ingest_client/primitives/jobs/job_spec.py,sha256=
|
|
15
|
+
nv_ingest_client/primitives/jobs/job_spec.py,sha256=qT8d9zxEO4ODAcwIlyU7yN1HSuQbDkhCXhLA9hNOURc,16831
|
|
16
16
|
nv_ingest_client/primitives/jobs/job_state.py,sha256=CEe_oZr4p_MobauWIyhuNrP8y7AUwxhIGBuO7dN-VOQ,5277
|
|
17
17
|
nv_ingest_client/primitives/tasks/__init__.py,sha256=D8X4XuwCxk4g_sMSpNRL1XsjVE1eACYaUdEjSanSEfU,1130
|
|
18
18
|
nv_ingest_client/primitives/tasks/audio_extraction.py,sha256=KD5VvaRm6PYelfofZq_-83CbOmupgosokZzFERI5wDA,3559
|
|
19
|
-
nv_ingest_client/primitives/tasks/caption.py,sha256=
|
|
19
|
+
nv_ingest_client/primitives/tasks/caption.py,sha256=w-xPKN77zruUel0md4OA-x2ciELSLY-8Px1ds76gak0,2498
|
|
20
20
|
nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
|
|
21
21
|
nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
|
|
22
22
|
nv_ingest_client/primitives/tasks/embed.py,sha256=ZLk7txs_0OHSjjxvRTYB5jm9RvvXRFo3i32Mj9d2mfc,7048
|
|
23
|
-
nv_ingest_client/primitives/tasks/extract.py,sha256=
|
|
23
|
+
nv_ingest_client/primitives/tasks/extract.py,sha256=jTCOSQG1MG0RoQg4DxPgmYgeHQR7O24hmysygkWYyIY,11270
|
|
24
24
|
nv_ingest_client/primitives/tasks/filter.py,sha256=dr6fWnh94i50MsGbrz9m_oN6DJKWIWsp7sMwm6Mjz8A,2617
|
|
25
25
|
nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
|
|
26
26
|
nv_ingest_client/primitives/tasks/ocr_extraction.py,sha256=w4uNITktOs-FLczL4ZzVdQTP4t_Ha-9PzCJWlXeOEN0,1486
|
|
27
27
|
nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
|
|
28
|
-
nv_ingest_client/primitives/tasks/store.py,sha256=
|
|
28
|
+
nv_ingest_client/primitives/tasks/store.py,sha256=UeIspL_RDPBbUV3gv8SK3tIoYNun8r4cSSMxXvBSaks,4575
|
|
29
29
|
nv_ingest_client/primitives/tasks/table_extraction.py,sha256=wQIC70ZNFt0DNQ1lxfvyR3Ci8hl5uAymHXTC0p6v0FY,1107
|
|
30
30
|
nv_ingest_client/primitives/tasks/task_base.py,sha256=Mrx6kgePJHolYd3Im6mVISXcVgdulLst2MYG5gPov9I,1687
|
|
31
31
|
nv_ingest_client/primitives/tasks/task_factory.py,sha256=uvGQXjgWmeF015jPWmBhiclzfrUf3_yD2PPeirQBczM,3218
|
|
@@ -40,17 +40,18 @@ nv_ingest_client/util/process_json_files.py,sha256=YKR-fGT4kM8zO2p8r5tpo5-vvFywk
|
|
|
40
40
|
nv_ingest_client/util/processing.py,sha256=bAy8it-OUgGFO3pcy6D3ezpyZ6p2DfmoQUGhx3QmVf8,8989
|
|
41
41
|
nv_ingest_client/util/system.py,sha256=DVIRLlEWkpqftqxazCuPNdaFSjQiHGMYcHzBufJSRUM,2216
|
|
42
42
|
nv_ingest_client/util/transport.py,sha256=Kwi3r-EUD5yOInW2rH7tYm2DXnzP3aU9l95V-BbXO90,1836
|
|
43
|
-
nv_ingest_client/util/util.py,sha256=
|
|
43
|
+
nv_ingest_client/util/util.py,sha256=zvWgIxIeATrtrS8olo_8-fHQ4aDd83yg2SjNDcHIv4g,16805
|
|
44
44
|
nv_ingest_client/util/zipkin.py,sha256=p2tMtTVAqrZGxmAxWKE42wkx7U5KywiX5munI7rJt_k,4473
|
|
45
45
|
nv_ingest_client/util/file_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
-
nv_ingest_client/util/file_processing/extract.py,sha256=
|
|
46
|
+
nv_ingest_client/util/file_processing/extract.py,sha256=sJBfyv4N2P0-izN4RyCsnSDKuDNugG_tW8XCqN9Uqck,5574
|
|
47
47
|
nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIrSfSKzbAk,513
|
|
48
|
-
nv_ingest_client/util/vdb/adt_vdb.py,sha256=
|
|
49
|
-
nv_ingest_client/util/vdb/
|
|
48
|
+
nv_ingest_client/util/vdb/adt_vdb.py,sha256=wT3LJMAy2VQu6daXhc3Pte4Ijs6jN-YP6B9-rnuH_FA,10868
|
|
49
|
+
nv_ingest_client/util/vdb/lancedb.py,sha256=mLykdOFkLC5-SpRvHAvt0do9rhyQDqy_H48D6hEtegw,10037
|
|
50
|
+
nv_ingest_client/util/vdb/milvus.py,sha256=NLlsYU5LdESh0r_Psvn0vzGiNN-70iouOGr3RgZaMVg,81316
|
|
50
51
|
nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
|
|
51
|
-
nv_ingest_client-2025.
|
|
52
|
-
nv_ingest_client-2025.
|
|
53
|
-
nv_ingest_client-2025.
|
|
54
|
-
nv_ingest_client-2025.
|
|
55
|
-
nv_ingest_client-2025.
|
|
56
|
-
nv_ingest_client-2025.
|
|
52
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
53
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/METADATA,sha256=EbEZoUk3-GvCBAB2z0hqZjgMOGasw75hZCWTDk7yxpk,30658
|
|
54
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
|
|
56
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
|
|
57
|
+
nv_ingest_client-2025.12.17.dev20251217.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|