nv-ingest-client 2025.11.17.dev20251117__py3-none-any.whl → 2025.12.17.dev20251217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_client/client/client.py +112 -2
- nv_ingest_client/client/interface.py +301 -83
- nv_ingest_client/nv_ingest_cli.py +2 -2
- nv_ingest_client/primitives/jobs/job_spec.py +26 -1
- nv_ingest_client/primitives/tasks/caption.py +12 -1
- nv_ingest_client/primitives/tasks/extract.py +50 -2
- nv_ingest_client/primitives/tasks/store.py +18 -13
- nv_ingest_client/util/file_processing/extract.py +23 -0
- nv_ingest_client/util/util.py +34 -1
- nv_ingest_client/util/vdb/adt_vdb.py +216 -0
- nv_ingest_client/util/vdb/lancedb.py +276 -0
- nv_ingest_client/util/vdb/milvus.py +44 -21
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/METADATA +2 -1
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/RECORD +18 -17
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.11.17.dev20251117.dist-info → nv_ingest_client-2025.12.17.dev20251217.dist-info}/top_level.txt +0 -0

nv_ingest_client/primitives/jobs/job_spec.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Union
+from typing import Tuple
 from uuid import UUID
 
 from nv_ingest_client.primitives.tasks import Task
@@ -222,7 +223,9 @@ class BatchJobSpec:
         A dictionary that maps document types to a list of `JobSpec` instances.
     """
 
-    def __init__(
+    def __init__(
+        self, job_specs_or_files: Optional[Union[List[JobSpec], List[str], List[Tuple[str, BytesIO]]]] = None
+    ) -> None:
         """
         Initializes the BatchJobSpec instance.
 
@@ -239,6 +242,13 @@ class BatchJobSpec:
            self.from_job_specs(job_specs_or_files)
         elif isinstance(job_specs_or_files[0], str):
             self.from_files(job_specs_or_files)
+        elif (
+            isinstance(job_specs_or_files[0], tuple)
+            and len(job_specs_or_files[0]) == 2
+            and isinstance(job_specs_or_files[0][0], str)
+            and isinstance(job_specs_or_files[0][1], BytesIO)
+        ):
+            self.from_buffers(job_specs_or_files)
         else:
             raise ValueError("Invalid input type for job_specs. Must be a list of JobSpec or file paths.")
 
@@ -282,6 +292,21 @@ class BatchJobSpec:
         for job_spec in job_specs:
             self.add_job_spec(job_spec)
 
+    def from_buffers(self, buffers: List[Tuple[str, BytesIO]]) -> None:
+        """
+        Initializes the batch from a list of buffers.
+
+        Parameters
+        ----------
+        buffers : List[Tuple[str, BytesIO]]
+            A list of tuples containing the name of the buffer and the BytesIO object.
+        """
+        from nv_ingest_client.util.util import create_job_specs_for_buffers
+
+        job_specs = create_job_specs_for_buffers(buffers)
+        for job_spec in job_specs:
+            self.add_job_spec(job_spec)
+
     def _from_dataset(self, dataset: str, shuffle_dataset: bool = True) -> None:
         """
         Internal method to initialize the batch from a dataset.
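
For reference, a minimal sketch of the new buffer-based path; the (name, BytesIO) tuple form and import path are taken from this diff, while the buffer contents are illustrative:

    from io import BytesIO

    from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec

    # Each buffer is a (name, BytesIO) pair; the name drives file-type inference.
    buffers = [("notes.txt", BytesIO(b"hello nv-ingest"))]
    batch = BatchJobSpec(buffers)  # dispatches to the new from_buffers()
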
nv_ingest_client/primitives/tasks/caption.py
CHANGED

@@ -22,18 +22,24 @@ class CaptionTask(Task):
         api_key: str = None,
         endpoint_url: str = None,
         prompt: str = None,
+        system_prompt: str = None,
         model_name: str = None,
     ) -> None:
         super().__init__()
 
         # Use the API schema for validation
         validated_data = IngestTaskCaptionSchema(
-            api_key=api_key,
+            api_key=api_key,
+            endpoint_url=endpoint_url,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            model_name=model_name,
         )
 
         self._api_key = validated_data.api_key
         self._endpoint_url = validated_data.endpoint_url
         self._prompt = validated_data.prompt
+        self._system_prompt = validated_data.system_prompt
         self._model_name = validated_data.model_name
 
     def __str__(self) -> str:
@@ -49,6 +55,8 @@ class CaptionTask(Task):
             info += f"  endpoint_url: {self._endpoint_url}\n"
         if self._prompt:
             info += f"  prompt: {self._prompt}\n"
+        if self._system_prompt:
+            info += f"  system_prompt: {self._system_prompt}\n"
         if self._model_name:
             info += f"  model_name: {self._model_name}\n"
 
@@ -69,6 +77,9 @@ class CaptionTask(Task):
         if self._prompt:
             task_properties["prompt"] = self._prompt
 
+        if self._system_prompt:
+            task_properties["system_prompt"] = self._system_prompt
+
         if self._model_name:
             task_properties["model_name"] = self._model_name
 
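
A short usage sketch of the new system_prompt parameter; the constructor arguments are as shown in this diff, and the prompt strings are illustrative:

    from nv_ingest_client.primitives.tasks.caption import CaptionTask

    task = CaptionTask(
        prompt="Describe this image in one sentence.",           # illustrative
        system_prompt="You are a concise technical captioner.",  # new in this release
    )
    print(task)  # __str__ now includes a system_prompt line when it is set
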
nv_ingest_client/primitives/tasks/extract.py
CHANGED

@@ -8,6 +8,8 @@
 
 import logging
 import os
+import warnings
+from typing import get_args
 from typing import Any
 from typing import Dict
 from typing import Literal
@@ -52,15 +54,27 @@ _DEFAULT_EXTRACTOR_MAP = {
 
 _Type_Extract_Method_PDF = Literal[
     "adobe",
-    "nemoretriever_parse",
+    "nemotron_parse",
     "haystack",
     "llama_parse",
     "pdfium",
     "tika",
     "unstructured_io",
+    "unstructured_local",
+    "pdfium_hybrid",
     "ocr",
 ]
 
+_Type_Extract_Method_DOCX = Literal[
+    "python_docx",
+    "render_as_pdf",
+]
+
+_Type_Extract_Method_PPTX = Literal[
+    "python_pptx",
+    "render_as_pdf",
+]
+
 _Type_Extract_Images_Method = Literal["group", "yolox"]
 
 _Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
@@ -74,7 +88,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method:
+        extract_method: Optional[str] = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -109,6 +123,12 @@ class ExtractTask(Task):
             )
             extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
 
+        if extract_method == "nemoretriever_parse":
+            logger.warning("'nemoretriever_parse' is deprecated. Please use 'nemotron_parse' instead.")
+            extract_method = "nemotron_parse"
+
+        self._validate_extract_method(document_type, extract_method)
+
         # Set default extract_charts if None
         if extract_charts is None:
             extract_charts = extract_tables
@@ -240,3 +260,31 @@ class ExtractTask(Task):
     @property
     def document_type(self):
         return self._document_type.value
+
+    def _validate_extract_method(self, document_type: str, extract_method: str):
+        doc_type = document_type.lower()
+
+        valid_docx = set(get_args(_Type_Extract_Method_DOCX))
+        valid_pptx = set(get_args(_Type_Extract_Method_PPTX))
+        valid_pdf = set(get_args(_Type_Extract_Method_PDF))
+
+        if doc_type == "docx" and extract_method not in valid_docx:
+            raise ValueError(f"'{extract_method}' is invalid for DOCX. Options: {valid_docx}")
+
+        elif doc_type == "pptx" and extract_method not in valid_pptx:
+            raise ValueError(f"'{extract_method}' is invalid for PPTX. Options: {valid_pptx}")
+
+        elif doc_type == "pdf" and extract_method not in valid_pdf:
+            raise ValueError(f"'{extract_method}' is invalid for PDF. Options: {valid_pdf}")
+
+        elif doc_type not in ["docx", "pptx", "pdf"]:
+            is_docx_method = extract_method in valid_docx
+            is_pptx_method = extract_method in valid_pptx
+            is_pdf_method = extract_method in valid_pdf
+
+            if (is_docx_method or is_pptx_method) and not is_pdf_method:
+                warnings.warn(
+                    f"extract_method '{extract_method}' is valid for Office documents but NOT for PDFs. "
+                    "If your batch includes PDFs, extraction may fail for those files. "
+                    "Consider leaving extract_method=None for mixed batches."
+                )
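
A sketch of how the renamed method and the new per-document-type validation behave, based on the logic in these hunks; the import path is inferred from the file layout, and passing document_type as a plain string is assumed:

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    # The old name still works, but logs a deprecation warning and is
    # mapped to "nemotron_parse" internally.
    task = ExtractTask(document_type="pdf", extract_method="nemoretriever_parse")

    # A DOCX-only method on a PDF now fails fast with a ValueError.
    try:
        ExtractTask(document_type="pdf", extract_method="python_docx")
    except ValueError as err:
        print(err)
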
nv_ingest_client/primitives/tasks/store.py
CHANGED

@@ -7,8 +7,7 @@
 # pylint: disable=too-many-arguments
 
 import logging
-from typing import Dict
-from typing import Literal
+from typing import Dict, Literal, Optional
 
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
 from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
@@ -17,23 +16,19 @@ from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_STORE_METHOD = "minio"
-
 
 class StoreTask(Task):
     """
     Object for image storage task.
     """
 
-    _Type_Content_Type = Literal["image",]
-
-    _Type_Store_Method = Literal["minio",]
-
     def __init__(
         self,
         structured: bool = True,
         images: bool = False,
-
+        storage_uri: Optional[str] = None,
+        storage_options: Optional[dict] = None,
+        public_base_url: Optional[str] = None,
         params: dict = None,
         **extra_params,
     ) -> None:
@@ -51,12 +46,19 @@ class StoreTask(Task):
 
         # Use the API schema for validation
         validated_data = IngestTaskStoreSchema(
-            structured=structured,
+            structured=structured,
+            images=images,
+            storage_uri=storage_uri,
+            storage_options=storage_options or {},
+            public_base_url=public_base_url,
+            params=merged_params,
         )
 
         self._structured = validated_data.structured
         self._images = validated_data.images
-        self.
+        self._storage_uri = validated_data.storage_uri
+        self._storage_options = validated_data.storage_options
+        self._public_base_url = validated_data.public_base_url
         self._params = validated_data.params
         self._extra_params = extra_params
 
@@ -68,7 +70,8 @@ class StoreTask(Task):
         info += "Store Task:\n"
         info += f"  store structured types: {self._structured}\n"
         info += f"  store image types: {self._images}\n"
-        info += f"
+        info += f"  storage uri: {self._storage_uri}\n"
+        info += f"  public base url: {self._public_base_url}\n"
         for key, value in self._extra_params.items():
             info += f"  {key}: {value}\n"
         for key, value in self._params.items():
@@ -81,9 +84,11 @@ class StoreTask(Task):
         """
 
         task_properties = {
-            "method": self._store_method,
             "structured": self._structured,
             "images": self._images,
+            "storage_uri": self._storage_uri,
+            "storage_options": self._storage_options,
+            "public_base_url": self._public_base_url,
            "params": self._params,
             **self._extra_params,
         }
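
A sketch of the new storage parameters; the parameter names come from this diff, while the URI, options, and URL values are placeholders:

    from nv_ingest_client.primitives.tasks.store import StoreTask

    task = StoreTask(
        structured=True,
        images=True,
        storage_uri="s3://example-bucket/nv-ingest",  # placeholder URI
        storage_options={"anon": False},              # backend-specific options (placeholder)
        public_base_url="https://cdn.example.com",    # placeholder
    )
    # The serialized task properties now carry storage_uri, storage_options,
    # and public_base_url instead of the removed "method" field.
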
nv_ingest_client/util/file_processing/extract.py
CHANGED

@@ -145,3 +145,26 @@ def extract_file_content(path: str) -> Tuple[str, DocumentTypeEnum]:
 
     logger.debug(f"Content extracted from '{path}'")
     return content, DocumentTypeEnum(document_type)
+
+
+def extract_content_from_buffer(buffer: Tuple[str, BytesIO]) -> Tuple[str, str]:
+    """
+    Extracts the content and type from a buffer.
+    """
+    document_type = get_or_infer_file_type(buffer[0])
+    try:
+        if document_type in [
+            DocumentTypeEnum.TXT,
+            DocumentTypeEnum.MD,
+            DocumentTypeEnum.HTML,
+        ]:
+            content = detect_encoding_and_read_text_file(buffer[1])
+        else:
+            content = serialize_to_base64(buffer[1])
+    except Exception as e:
+        logger.error(f"Error processing buffer {buffer[0]}: {e}")
+
+        raise ValueError(f"Failed to extract content from buffer {buffer[0]}") from e
+
+    logger.debug(f"Content extracted from '{buffer[0]}'")
+    return content, DocumentTypeEnum(document_type)
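
A quick sketch of the new helper; the signature is taken from this diff, and the buffer contents are illustrative:

    from io import BytesIO

    from nv_ingest_client.util.file_processing.extract import extract_content_from_buffer

    content, doc_type = extract_content_from_buffer(("readme.md", BytesIO(b"# Title")))
    # Text-like types (TXT, MD, HTML) are decoded as text; everything else
    # is serialized to base64.
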
nv_ingest_client/util/util.py
CHANGED

@@ -12,10 +12,12 @@ import math
 import heapq
 from typing import Dict
 from typing import List
+from typing import Tuple
+from io import BytesIO
 
 from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
 from nv_ingest_client.primitives.jobs.job_spec import JobSpec
-from nv_ingest_client.util.file_processing.extract import extract_file_content
+from nv_ingest_client.util.file_processing.extract import extract_file_content, extract_content_from_buffer
 
 logger = logging.getLogger(__name__)
 
@@ -350,6 +352,37 @@ def create_job_specs_for_batch(files_batch: List[str]) -> List[JobSpec]:
     return job_specs
 
 
+def create_job_specs_for_buffers(buffers: List[Tuple[str, BytesIO]]) -> List[JobSpec]:
+    """
+    Create job specifications (JobSpecs) for a list of buffers.
+    This function takes a list of buffers, processes each buffer to extract its content and type,
+    and creates a job specification (JobSpec) for each buffer.
+
+    Parameters
+    ----------
+    buffers : List[Tuple[str, BytesIO]]
+        A list of tuples containing the name of the buffer and the BytesIO object.
+
+    Returns
+    -------
+    List[JobSpec]
+        A list of JobSpecs.
+    """
+
+    job_specs = []
+    for name, buffer in buffers:
+        content, file_type = extract_content_from_buffer((name, buffer))
+        job_spec = JobSpec(
+            document_type=file_type,
+            payload=content,
+            source_id=name,
+            source_name=name,
+        )
+        job_specs.append(job_spec)
+
+    return job_specs
+
+
 def apply_pdf_split_config_to_job_specs(job_specs: List[JobSpec], pages_per_chunk: int) -> None:
     """
     Apply PDF split configuration to a list of JobSpec objects.
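
The lower-level helper can also be used directly when JobSpecs are wanted without a BatchJobSpec; a minimal sketch with illustrative payload contents:

    from io import BytesIO

    from nv_ingest_client.util.util import create_job_specs_for_buffers

    specs = create_job_specs_for_buffers([("notes.txt", BytesIO(b"hello"))])
    assert len(specs) == 1  # one JobSpec per (name, buffer) pair
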
nv_ingest_client/util/vdb/adt_vdb.py
CHANGED

@@ -1,27 +1,243 @@
 from abc import ABC, abstractmethod
 
 
+"""Abstract Vector Database (VDB) operator API.
+
+This module defines the `VDB` abstract base class which specifies the
+interface that custom vector-database operators must implement to integrate
+with NV-Ingest.
+
+The implementation details and an example OpenSearch operator are described
+in the `examples/building_vdb_operator.ipynb` notebook in this repository, and a
+production-ready OpenSearch implementation is available at
+`client/src/nv_ingest_client/util/vdb/opensearch.py`.
+
+Design goals:
+- Provide a small, well-documented interface that supports common vector
+  database operations: index creation, batch ingestion, nearest-neighbor
+  retrieval, and a simple `run` orchestration entry-point used by the
+  NV-Ingest pipeline.
+- Keep the API flexible by accepting `**kwargs` on methods so implementers can
+  pass database-specific options without changing the interface.
+
+Typical implementation notes (inferred from the example OpenSearch operator):
+- Constructor accepts connection and index configuration parameters such as
+  `host`, `port`, `index_name`, `dense_dim` and feature toggles for content
+  types (e.g. `enable_text`, `enable_images`).
+- `create_index` should be able to create (and optionally recreate) an
+  index with appropriate vector settings (k-NN, HNSW/FAISS parameters, etc.).
+- `write_to_index` should accept batches of NV-Ingest records, perform
+  validation/transformation, and write documents into the database efficiently
+  (bulk APIs are recommended).
+- `retrieval` should accept a list of textual queries, convert them to
+  embeddings (by calling an external embedding service or model), perform a
+  vector search (top-k), and return cleaned results (e.g., removing stored
+  dense vectors from returned payloads).
+
+"""
+
+
 class VDB(ABC):
+    """Abstract base class for Vector Database operators.
+
+    Subclasses must implement the abstract methods below. The interface is
+    intentionally small and uses `**kwargs` to allow operator-specific
+    configuration without changing the common API.
+
+    Example (high level):
+
+        class OpenSearch(VDB):
+            def __init__(self, **kwargs):
+                # parse kwargs, initialize client, call super().__init__(**kwargs)
+                ...
+
+            def create_index(self, **kwargs):
+                # create index, mappings, settings
+                ...
+
+            def write_to_index(self, records: list, **kwargs):
+                # transform NV-Ingest records and write to database
+                ...
+
+            def retrieval(self, queries: list, **kwargs):
+                # convert queries to embeddings, k-NN search, format results
+                ...
+
+            def run(self, records):
+                # orchestrate create_index + write_to_index
+                ...
+
+    Notes on recommended constructor parameters (not enforced by this ABC):
+    - host (str): database hostname (default: 'localhost')
+    - port (int): database port (default: 9200 for OpenSearch/Elasticsearch)
+    - index_name (str): base index name used by the operator
+    - dense_dim (int): dimensionality of stored dense embeddings
+    - enable_text/enable_images/... (bool): content-type toggles used when
+      extracting text from NV-Ingest records before indexing
+
+    The concrete operator may accept additional parameters (username,
+    password, ssl options, client-specific flags). Passing these via
+    `**kwargs` is the intended pattern.
+    """
 
     @abstractmethod
     def __init__(self, **kwargs):
+        """Initialize the VDB operator.
+
+        Implementations should extract configuration values from `kwargs`
+        (or use defaults) and initialize any client connections required to
+        talk to the target vector database. Implementations are encouraged to
+        call `super().__init__(**kwargs)` only if they want the base-class
+        behavior of storing kwargs on the instance (the base class itself does
+        not require that behavior).
+
+        Parameters (suggested/common):
+        - host (str): database host
+        - port (int): database port
+        - index_name (str): base name for created indices
+        - dense_dim (int): embedding vector dimension
+        - enable_text (bool): whether text content should be extracted/indexed
+        - enable_images (bool), enable_audio (bool), etc.: other toggles
+
+        The constructor should not perform heavy operations (like creating
+        indices) unless explicitly desired; prefer leaving that work to
+        `create_index` to make the operator easier to test.
+        """
         self.__dict__.update(kwargs)
 
     @abstractmethod
     def create_index(self, **kwargs):
+        """Create and configure the index(es) required by this operator.
+
+        Implementations must ensure an appropriate index (or indices) exist
+        before data ingestion. For vector indexes this typically means
+        creating settings and mappings that enable k-NN/vector search (for
+        example, enabling an HNSW/FAISS engine, setting `dimension`, and any
+        engine-specific parameters).
+
+        Common keyword arguments (operator-specific):
+        - recreate (bool): if True, delete and recreate the index even if it
+          already exists (default: False)
+        - index_name (str): override the operator's configured index name for
+          this call
+
+        Returns:
+            implementation-specific result (e.g., a boolean, the created
+            index name, or the raw response from the database client). There
+            is no strict requirement here because different DB clients return
+            different values; document behavior in concrete implementations.
+        """
         pass
 
     @abstractmethod
     def write_to_index(self, records: list, **kwargs):
+        """Write a batch of NV-Ingest records to the vector database.
+
+        This method receives `records` formatted as NV-Ingest provides them
+        (commonly a list of record-sets). Implementations are responsible for
+        transforming each record into the target database document format,
+        validating the presence of embeddings and content, and using the most
+        efficient ingestion API available (for example a bulk endpoint).
+
+        Expected behavior:
+        - Iterate over the provided `records` (which can be nested lists of
+          record dictionaries) and transform each record to the DB document
+          structure (fields such as `dense` for the vector, `text` for the
+          content, and `metadata` for auxiliary fields are common in the
+          repository examples).
+        - Skip records missing required fields (for example, missing
+          embeddings) and log or report failures as appropriate.
+        - Use batching / bulk APIs to reduce overhead when writing large
+          volumes of documents.
+
+        Parameters:
+        - records (list): NV-Ingest records (see repository examples for
+          structure)
+        - batch_size (int, optional): how many documents to send per bulk
+          request; database-specific implementations can use this hint
+
+        Returns:
+            implementation-specific result (e.g., number of documents
+            indexed, client response for bulk API). Concrete implementations
+            should document exact return values and failure semantics.
+        """
         pass
 
     @abstractmethod
     def retrieval(self, queries: list, **kwargs):
+        """Perform similarity search for a list of text queries.
+
+        The typical retrieval flow implemented by operators in this ecosystem
+        is:
+        1. Convert each textual `query` into a dense embedding using an
+           external embedding model or service (the example uses an NVIDIA
+           embedding model via `llama_index.embeddings.nvidia.NVIDIAEmbedding`).
+        2. Issue a vector (k-NN) search to the database using the generated
+           embedding, requesting the top-k (configurable) neighbors.
+        3. Post-process results (for example, remove stored dense vectors
+           from returned documents to reduce payload size) and return a
+           list-of-lists of result documents aligned with the input `queries`.
+
+        Keyword arguments (common):
+        - index_name (str): index to search (default: operator's configured
+          index_name)
+        - top_k (int): number of nearest neighbors to return (default: 10)
+        - embedding_endpoint / model_name / nvidia_api_key: parameters needed
+          when the operator integrates with an external embedding service.
+
+        Parameters:
+        - queries (list[str]): list of text queries to be vectorized and
+          searched
+
+        Returns:
+        - results (list[list[dict]]): for each query, a list of hit documents
+          (concrete implementations should specify the document shape they
+          return). Operators should remove large binary/vector fields from
+          responses where possible.
+        """
         pass
 
     @abstractmethod
     def run(self, records):
+        """Main entry point used by the NV-Ingest pipeline.
+
+        The `run` method is intended to be a simple orchestration layer that
+        ensures the index exists and then ingests provided records. A minimal
+        recommended implementation is::
+
+            def run(self, records):
+                self.create_index()
+                self.write_to_index(records)
+
+        Implementers can add pre/post hooks, metrics, retries, or error
+        handling as needed for production readiness. Keep `run` simple so the
+        pipeline orchestration remains predictable.
+
+        Parameters:
+        - records: NV-Ingest records to index (format follows repository
+          conventions)
+
+        Returns:
+        - implementation-specific result (for example, a summary dict or
+          boolean success flag).
+        """
         pass
 
     def reindex(self, records: list, **kwargs):
+        """Optional helper to rebuild or re-populate indexes with new data.
+
+        This non-abstract method is provided as an optional hook that concrete
+        classes may override. A typical reindex implementation will:
+        - optionally delete the existing index and recreate it (via
+          `create_index(recreate=True)`)
+        - call `write_to_index(records)` to populate the new index
+
+        Parameters:
+        - records (list): records used to populate the index
+        - recreate (bool, optional): whether to delete and recreate the
+          index before writing
+
+        Returns:
+        - implementation-specific result
+        """
         pass