endee-llamaindex 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- endee_llamaindex/base.py +613 -658
- endee_llamaindex/constants.py +70 -0
- endee_llamaindex/utils.py +7 -587
- {endee_llamaindex-0.1.3.dist-info → endee_llamaindex-0.1.5.dist-info}/METADATA +173 -50
- endee_llamaindex-0.1.5.dist-info/RECORD +8 -0
- {endee_llamaindex-0.1.3.dist-info → endee_llamaindex-0.1.5.dist-info}/WHEEL +1 -1
- endee_llamaindex-0.1.3.dist-info/RECORD +0 -7
- {endee_llamaindex-0.1.3.dist-info → endee_llamaindex-0.1.5.dist-info}/top_level.txt +0 -0
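For orientation, a minimal usage sketch assembled from the from_params signature visible in the base.py diff below; the import path, token, index names, and parameter values are illustrative assumptions, not documented API:

    # Sketch based on the 0.1.5 signatures in this diff; "YOUR_API_TOKEN",
    # the index names, and the import path are placeholders/assumptions.
    from endee_llamaindex.base import EndeeVectorStore

    # Dense-only index (sparse_dim omitted).
    store = EndeeVectorStore.from_params(
        api_token="YOUR_API_TOKEN",
        index_name="my_index",
        dimension=384,
        space_type="cosine",
        precision="float16",
    )

    # Hybrid index: per the new from_params logic, sparse_dim > 0 auto-enables
    # hybrid mode and falls back to the default 'splade_pp' sparse encoder.
    hybrid_store = EndeeVectorStore.from_params(
        api_token="YOUR_API_TOKEN",
        index_name="my_hybrid_index",
        dimension=384,
        sparse_dim=30522,
        M=16,        # optional HNSW connectivity parameter
        ef_con=200,  # optional HNSW construction parameter
    )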
endee_llamaindex/base.py
CHANGED
@@ -1,16 +1,32 @@
+"""
+EndeeVectorStore: LlamaIndex vector store backed by the Endee API.
+
+Aligned with the local endee package (./endee). API contract:
+
+Endee (endee.endee):
+- __init__(token, http_library)
+- create_index(name, dimension, space_type, M, ef_con, precision, version, sparse_dim)
+  Validates: index name (alphanumeric + underscores, max length), dimension <= MAX_DIMENSION_ALLOWED,
+  space_type in SPACE_TYPES_SUPPORTED ('cosine','l2','ip'), precision in PRECISION_TYPES_SUPPORTED,
+  sparse_dim >= 0. Map 'euclidean'->'l2', 'inner_product'->'ip' before calling.
+- get_index(name) -> Index
+
+Index (endee.index):
+- upsert(input_array): list of {id, vector, meta?, filter?, sparse_indices?, sparse_values?}; max MAX_VECTORS_PER_BATCH per batch; duplicate IDs in batch raise
+- query(vector, top_k, filter, ef, include_vectors, sparse_indices, sparse_values)
+- delete_vector(id), get_vector(id), describe()
+
+No list_ids or batch fetch in endee; filter for query is JSON-serializable (e.g. [{"field":{"$op":value}}] or dict).
+"""
+
 import logging
-from collections import Counter
-from functools import partial
 import json
 from typing import Any, Callable, Dict, List, Optional, cast
-
 from llama_index.core.bridge.pydantic import PrivateAttr
-from llama_index.core.schema import BaseNode,
+from llama_index.core.schema import BaseNode, TextNode
 from llama_index.core.vector_stores.types import (
     BasePydanticVectorStore,
-    MetadataFilters,
     VectorStoreQuery,
-    VectorStoreQueryMode,
     VectorStoreQueryResult,
 )
 from llama_index.core.vector_stores.utils import (
@@ -19,258 +35,34 @@ from llama_index.core.vector_stores.utils import (
     metadata_dict_to_node,
     node_to_metadata_dict,
 )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-ID_KEY = "id"
-VECTOR_KEY = "values"
-SPARSE_VECTOR_KEY = "sparse_values"
-METADATA_KEY = "metadata"
-
-DEFAULT_BATCH_SIZE = 100
-
+from .constants import (
+    DEFAULT_BATCH_SIZE,
+    DEFAULT_EF_SEARCH,
+    MAX_DIMENSION_ALLOWED,
+    MAX_EF_SEARCH_ALLOWED,
+    MAX_INDEX_NAME_LENGTH_ALLOWED,
+    MAX_TOP_K_ALLOWED,
+    MAX_VECTORS_PER_BATCH,
+    PRECISION_VALID,
+    REVERSE_OPERATOR_MAP,
+    SPACE_TYPE_MAP,
+    SPACE_TYPES_VALID,
+    SUPPORTED_FILTER_OPERATORS,
+)
+from .utils import get_sparse_encoder
+from endee import Endee
 _logger = logging.getLogger(__name__)
 
-from llama_index.core.vector_stores.types import MetadataFilter, FilterOperator
-
-reverse_operator_map = {
-    FilterOperator.EQ: "$eq",
-    FilterOperator.NE: "$ne",
-    FilterOperator.GT: "$gt",
-    FilterOperator.GTE: "$gte",
-    FilterOperator.LT: "$lt",
-    FilterOperator.LTE: "$lte",
-    FilterOperator.IN: "$in",
-    FilterOperator.NIN: "$nin",
-}
-
-
-
-def build_dict(input_batch: List[List[int]]) -> List[Dict[str, Any]]:
-    """
-    Build a list of sparse dictionaries from a batch of input_ids.
-
-    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.
-
-    """
-    # store a batch of sparse embeddings
-    sparse_emb = []
-    # iterate through input batch
-    for token_ids in input_batch:
-        indices = []
-        values = []
-        # convert the input_ids list to a dictionary of key to frequency values
-        d = dict(Counter(token_ids))
-        for idx in d:
-            indices.append(idx)
-            values.append(float(d[idx]))
-        sparse_emb.append({"indices": indices, "values": values})
-    # return sparse_emb list
-    return sparse_emb
-
-
-def generate_sparse_vectors(
-    context_batch: List[str], tokenizer: Callable
-) -> List[Dict[str, Any]]:
-    """
-    Generate sparse vectors from a batch of contexts.
-
-    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.
-
-    """
-    # create batch of input_ids
-    inputs = tokenizer(context_batch)["input_ids"]
-    # create sparse dictionaries
-    return build_dict(inputs)
 
 
 # Supported sparse embedding models
 SUPPORTED_SPARSE_MODELS = {
     "splade_pp": "prithivida/Splade_PP_en_v1",
     "splade_cocondenser": "naver/splade-cocondenser-ensembledistil",
-    "bert_base": "bert-base-uncased",
-    "distilbert": "distilbert-base-uncased",
-    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
-    "mpnet": "sentence-transformers/all-mpnet-base-v2",
-    "roberta": "roberta-base",
-    "xlm_roberta": "xlm-roberta-base",
 }
 
 
-def _initialize_sparse_encoder_fastembed(
-    model_name: str,
-    batch_size: int = 256,
-    cache_dir: Optional[str] = None,
-    threads: Optional[int] = None,
-) -> Callable:
-    """
-    Initialize a sparse encoder using FastEmbed (recommended for SPLADE models).
-
-    Args:
-        model_name: Model identifier or alias
-        batch_size: Batch size for encoding
-        cache_dir: Directory to cache model files
-        threads: Number of threads to use
-
-    Returns:
-        Callable function that generates sparse vectors from text
-    """
-    try:
-        from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
-    except ImportError as e:
-        raise ImportError(
-            "Could not import FastEmbed. "
-            "Please install it with `pip install fastembed` or "
-            "`pip install fastembed-gpu` for GPU support."
-        ) from e
-
-    # Resolve model name from alias if needed
-    resolved_model_name = SUPPORTED_SPARSE_MODELS.get(model_name, model_name)
-
-    # Try GPU first, fallback to CPU
-    try:
-        model = SparseTextEmbedding(
-            resolved_model_name,
-            cache_dir=cache_dir,
-            threads=threads,
-            providers=["CUDAExecutionProvider"],
-        )
-        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on GPU")
-    except Exception:
-        model = SparseTextEmbedding(
-            resolved_model_name,
-            cache_dir=cache_dir,
-            threads=threads
-        )
-        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on CPU")
-
-    def compute_vectors(texts: List[str]) -> tuple:
-        """Compute sparse vectors (indices, values) for a list of texts."""
-        embeddings = model.embed(texts, batch_size=batch_size)
-        indices = []
-        values = []
-        for embedding in embeddings:
-            indices.append(embedding.indices.tolist())
-            values.append(embedding.values.tolist())
-        return indices, values
-
-    return compute_vectors
-
-
-def _initialize_sparse_encoder_transformers(
-    model_name: str,
-) -> Callable:
-    """
-    Initialize a sparse encoder using Transformers library.
-
-    Args:
-        model_name: Model identifier or alias
-
-    Returns:
-        Callable function that generates sparse vectors from text
-    """
-    try:
-        import torch
-        from transformers import AutoModelForMaskedLM, AutoTokenizer
-    except ImportError as e:
-        raise ImportError(
-            "Could not import transformers library. "
-            'Please install transformers with `pip install "transformers[torch]"`'
-        ) from e
-
-    # Resolve model name from alias if needed
-    resolved_model_name = SUPPORTED_SPARSE_MODELS.get(model_name, model_name)
-
-    tokenizer = AutoTokenizer.from_pretrained(resolved_model_name)
-    model = AutoModelForMaskedLM.from_pretrained(resolved_model_name)
-
-    if torch.cuda.is_available():
-        model = model.to("cuda")
-        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on GPU")
-    else:
-        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on CPU")
-
-    def compute_vectors(texts: List[str]) -> tuple:
-        """
-        Compute sparse vectors from logits using ReLU, log, and max operations.
-        """
-        tokens = tokenizer(
-            texts,
-            truncation=True,
-            padding=True,
-            max_length=512,
-            return_tensors="pt"
-        )
-
-        if torch.cuda.is_available():
-            tokens = tokens.to("cuda")
-
-        with torch.no_grad():
-            output = model(**tokens)
-            logits, attention_mask = output.logits, tokens.attention_mask
-            relu_log = torch.log(1 + torch.relu(logits))
-            weighted_log = relu_log * attention_mask.unsqueeze(-1)
-            tvecs, _ = torch.max(weighted_log, dim=1)
-
-        # Extract non-zero vectors and their indices
-        indices = []
-        values = []
-        for batch in tvecs:
-            nz_indices = batch.nonzero(as_tuple=True)[0].tolist()
-            indices.append(nz_indices)
-            values.append(batch[nz_indices].tolist())
-
-        return indices, values
-
-    return compute_vectors
-
-
-def get_sparse_encoder(
-    model_name: Optional[str] = None,
-    use_fastembed: bool = True,
-    batch_size: int = 256,
-    cache_dir: Optional[str] = None,
-    threads: Optional[int] = None,
-) -> Optional[Callable]:
-    """
-    Get a sparse encoder function for the specified model.
-
-    Args:
-        model_name: Model name or alias (e.g., 'splade_pp', 'bert_base', or full model ID)
-        use_fastembed: If True, use FastEmbed (recommended for SPLADE models), else use Transformers
-        batch_size: Batch size for encoding
-        cache_dir: Directory to cache model files
-        threads: Number of threads to use
-
-    Returns:
-        Callable function that generates sparse vectors, or None if model_name is not provided
-    """
-    if model_name is None:
-        return None
-
-    if use_fastembed:
-        return _initialize_sparse_encoder_fastembed(
-            model_name=model_name,
-            batch_size=batch_size,
-            cache_dir=cache_dir,
-            threads=threads,
-        )
-    else:
-        return _initialize_sparse_encoder_transformers(model_name=model_name)
+# Import sparse encoder utilities from utils module
 
 
 import_err_msg = (
@@ -282,22 +74,18 @@ class EndeeVectorStore(BasePydanticVectorStore):
 
     stores_text: bool = True
     flat_metadata: bool = False
-
     api_token: Optional[str]
     index_name: Optional[str]
     space_type: Optional[str]
     dimension: Optional[int]
-    insert_kwargs: Optional[Dict]
     add_sparse_vector: bool
     text_key: str
     batch_size: int
     remove_text_from_metadata: bool
     hybrid: bool
-
+    sparse_dim: Optional[int]
     model_name: Optional[str]
     precision: Optional[str]
-    key: Optional[str]
-
    _endee_index: Any = PrivateAttr()
     _sparse_encoder: Optional[Callable] = PrivateAttr(default=None)
 
@@ -308,63 +96,76 @@ class EndeeVectorStore(BasePydanticVectorStore):
         index_name: Optional[str] = None,
         space_type: Optional[str] = "cosine",
         dimension: Optional[int] = None,
-        insert_kwargs: Optional[Dict] = None,
         add_sparse_vector: bool = False,
         text_key: str = DEFAULT_TEXT_KEY,
         batch_size: int = DEFAULT_BATCH_SIZE,
         remove_text_from_metadata: bool = False,
         hybrid: bool = False,
-
+        sparse_dim: Optional[int] = None,
         model_name: Optional[str] = None,
-        precision: Optional[str] = "
-
+        precision: Optional[str] = "float16",
+        M: Optional[int] = None,
+        ef_con: Optional[int] = None,
         **kwargs: Any,
     ) -> None:
-
-
-
-
-
-
-
-
-            add_sparse_vector=add_sparse_vector,
-            text_key=text_key,
-            batch_size=batch_size,
-            remove_text_from_metadata=remove_text_from_metadata,
-            vocab_size=vocab_size,
-            hybrid=hybrid,
-            model_name=model_name,
-            precision=precision,
-            key=key,
-        )
-
-        # Initialize index based on hybrid flag
-        if endee_index is not None:
-            # Use provided index
-            self._endee_index = endee_index
-        elif hybrid:
-            # Initialize hybrid index
-            self._endee_index = self._initialize_hybrid_index(
-                api_token, index_name, dimension, space_type, vocab_size, precision, key
-            )
-        else:
-            # Initialize regular index
-            self._endee_index = self._initialize_endee_index(
-                api_token, index_name, dimension, space_type, precision, key
-            )
-
-        # Initialize sparse encoder if model name is provided and hybrid mode is enabled
-        if hybrid and model_name:
-            _logger.info(f"Initializing sparse encoder with model: {model_name}")
-            self._sparse_encoder = get_sparse_encoder(
-                model_name=model_name,
-                use_fastembed=True,  # Default to FastEmbed
+        try:
+            super().__init__(
+                index_name=index_name,
+                api_token=api_token,
+                space_type=space_type,
+                dimension=dimension,
+                add_sparse_vector=add_sparse_vector,
+                text_key=text_key,
                 batch_size=batch_size,
+                remove_text_from_metadata=remove_text_from_metadata,
+                sparse_dim=sparse_dim,
+                hybrid=hybrid,
+                model_name=model_name,
+                precision=precision,
             )
-
-
-
+
+            # Initialize index (handles both dense and hybrid)
+            if endee_index is not None:
+                self._endee_index = endee_index
+            else:
+                # sparse_dim=None creates dense index, sparse_dim>0 creates hybrid index
+                self._endee_index = self._initialize_endee_index(
+                    api_token,
+                    index_name,
+                    dimension,
+                    space_type,
+                    precision,
+                    sparse_dim=sparse_dim if hybrid else None,
+                    M=M,
+                    ef_con=ef_con,
+                )
+
+            # Initialize sparse encoder if hybrid mode is enabled
+            if hybrid:
+                # Use default model if none provided
+                if model_name is None:
+                    model_name = "splade_pp"  # Default sparse model
+                    _logger.info(f"Using default sparse model: {model_name}")
+                # If user provided an unsupported model, fall back to one of the supported models
+                elif model_name not in SUPPORTED_SPARSE_MODELS:
+                    _logger.warning(
+                        f"Unsupported sparse model_name {model_name!r} provided. "
+                        "Falling back to default 'splade_pp'. "
+                        f"Supported sparse models: {list(SUPPORTED_SPARSE_MODELS.keys())}"
+                    )
+                    model_name = "splade_pp"
+
+                _logger.info(f"Initializing sparse encoder with model: {model_name}")
+                self._sparse_encoder = get_sparse_encoder(
+                    model_name=model_name,
+                    use_fastembed=True,  # Default to FastEmbed
+                    batch_size=batch_size,
+                )
+            else:
+                self._sparse_encoder = None
+        except Exception as e:
+            _logger.error(f"Error initializing EndeeVectorStore: {e}")
+            raise
 
     @classmethod
     def _initialize_endee_index(
@@ -373,82 +174,139 @@ class EndeeVectorStore(BasePydanticVectorStore):
         index_name: Optional[str],
         dimension: Optional[int] = None,
         space_type: Optional[str] = "cosine",
-        precision: Optional[str] = "
-
+        precision: Optional[str] = "float16",
+        sparse_dim: Optional[int] = None,
+        M: Optional[int] = None,
+        ef_con: Optional[int] = None,
     ) -> Any:
-        """
-
-        from endee.endee import Endee
-
-        # Initialize Endee client
-        nd = Endee(token=api_token)
+        """
+        Initialize Endee index (dense or hybrid).
 
+        Args:
+            api_token: Endee API token
+            index_name: Name of the index
+            dimension: Dense vector dimension
+            space_type: Distance metric (cosine, l2, ip)
+            precision: Vector precision type
+            sparse_dim: Sparse vector dimension. If None or 0, creates dense-only index.
+                If > 0, creates hybrid index with both dense and sparse vectors.
+            M: HNSW graph connectivity parameter (optional)
+            ef_con: HNSW construction parameter (optional)
+
+        Returns:
+            Endee Index object
+        """
         try:
-
-
-            _logger.info(f"Retrieved existing index: {index_name}")
-            return index
-        except Exception as e:
-            if dimension is None:
-                raise ValueError(
-                    "Must provide dimension when creating a new index"
-                ) from e
-
-            # Create a new index if it doesn't exist
-            _logger.info(f"Creating new index: {index_name}")
-            nd.create_index(
-                name=index_name,
-                dimension=dimension,
-                space_type=space_type,
-                precision=precision,
-                key=key,
-            )
-            return nd.get_index(name=index_name, key=key)
+
+
 
-
-
-
-
-
-        dimension: Optional[int] = None,
-        space_type: Optional[str] = "cosine",
-        vocab_size: Optional[int] = None,
-        precision: Optional[str] = "medium",
-        key: Optional[str] = None,
-    ) -> Any:
-        """Initialize Endee hybrid index using the current API."""
-        endee = _import_endee()
-        from endee.endee import Endee
+            _logger.info("Connecting to Endee service...")
+            nd = Endee(token=api_token)
+            prec = precision if precision is not None else "float16"
+            is_hybrid = sparse_dim is not None and sparse_dim > 0
+            dim_sparse = sparse_dim if is_hybrid else 0
 
-
-
+            try:
+                _logger.info(f"Checking if index '{index_name}' exists...")
+                index = nd.get_index(name=index_name)
+                # Check if existing index matches expected type
+                existing_sparse_dim = getattr(index, "sparse_dim", 0)
+                if is_hybrid and existing_sparse_dim > 0:
+                    _logger.info(f"✓ Retrieved existing hybrid index: {index_name}")
+                elif not is_hybrid and existing_sparse_dim == 0:
+                    _logger.info(f"✓ Retrieved existing dense index: {index_name}")
+                elif is_hybrid and existing_sparse_dim == 0:
+                    _logger.warning(
+                        f"Index '{index_name}' exists as dense-only (sparse_dim=0) but hybrid was requested. "
+                        f"Using existing dense index."
+                    )
+                else:
+                    _logger.warning(
+                        f"Index '{index_name}' exists as hybrid (sparse_dim={existing_sparse_dim}) "
+                        f"but dense-only was requested. Using existing hybrid index."
+                    )
+                return index
+
+            except Exception as e:
+                # Index doesn't exist, create new one
+                if dimension is None:
+                    raise ValueError(
+                        f"Must provide dimension when creating a new {'hybrid' if is_hybrid else 'dense'} index"
+                    ) from e
+                if is_hybrid and sparse_dim is None:
+                    raise ValueError(
+                        "Must provide sparse_dim when creating a new hybrid index"
+                    ) from e
+
+                # Validate index name
+                try:
+                    from endee.utils import is_valid_index_name
+
+                    if not is_valid_index_name(index_name):
+                        raise ValueError(
+                            f"Invalid index name. Index name must be alphanumeric and can "
+                            f"contain underscores and should be less than "
+                            f"{MAX_INDEX_NAME_LENGTH_ALLOWED} characters"
+                        )
+                except ImportError:
+                    pass
+
+                # Validate dimension
+                if dimension > MAX_DIMENSION_ALLOWED:
+                    raise ValueError(
+                        f"Dimension cannot be greater than {MAX_DIMENSION_ALLOWED}"
+                    )
+
+                # Validate sparse_dim
+                if dim_sparse < 0:
+                    raise ValueError("sparse_dim cannot be negative")
+
+                # Validate and map space_type
+                space = SPACE_TYPE_MAP.get(
+                    (space_type or "cosine").lower(), (space_type or "cosine").lower()
+                )
+                if space not in SPACE_TYPES_VALID:
+                    raise ValueError(f"Invalid space type: {space}")
+
+                # Validate precision
+                if prec not in PRECISION_VALID:
+                    raise ValueError(
+                        f"Invalid precision: {prec}. Use one of {PRECISION_VALID}"
+                    )
+
+                # Build create_index kwargs
+                create_kwargs = {
+                    "name": index_name,
+                    "dimension": dimension,
+                    "space_type": space,
+                    "precision": prec,
+                    "sparse_dim": dim_sparse,
+                }
+                # Only add M and ef_con if provided
+                if M is not None:
+                    create_kwargs["M"] = M
+                if ef_con is not None:
+                    create_kwargs["ef_con"] = ef_con
+                # Build log message
+                index_type = "hybrid" if is_hybrid else "dense"
+                log_msg = f"Creating new {index_type} index '{index_name}' (dimension={dimension}"
+                if is_hybrid:
+                    log_msg += f", sparse_dim={dim_sparse}"
+                if M is not None:
+                    log_msg += f", M={M}"
+                if ef_con is not None:
+                    log_msg += f", ef_con={ef_con}"
+                log_msg += ")..."
+
+                _logger.info(log_msg)
+
+                nd.create_index(**create_kwargs)
+                _logger.info("✓ Index created successfully")
+                return nd.get_index(name=index_name)
 
-            try:
-                # Try to get existing hybrid index
-                index = nd.get_hybrid_index(name=index_name, key=key)
-                _logger.info(f"Retrieved existing hybrid index: {index_name}")
-                return index
         except Exception as e:
-
-
-                    "Must provide dimension when creating a new hybrid index"
-                ) from e
-            if vocab_size is None:
-                raise ValueError(
-                    "Must provide vocab_size when creating a new hybrid index"
-                ) from e
-
-            # Create a new hybrid index if it doesn't exist
-            _logger.info(f"Creating new hybrid index: {index_name}")
-            nd.create_hybrid_index(
-                name=index_name,
-                dimension=dimension,
-                space_type=space_type,
-                vocab_size=vocab_size,
-                precision=precision,
-                key=key,
-            )
-            return nd.get_hybrid_index(name=index_name, key=key)
+            _logger.error(f"Error initializing Endee index: {e}")
+            raise
 
     @classmethod
     def from_params(
@@ -459,351 +317,448 @@
         space_type: str = "cosine",
         batch_size: int = DEFAULT_BATCH_SIZE,
         hybrid: bool = False,
-
+        sparse_dim: Optional[int] = None,
         model_name: Optional[str] = None,
-        precision: Optional[str] = "
-
+        precision: Optional[str] = "float16",
+        M: Optional[int] = None,
+        ef_con: Optional[int] = None,
     ) -> "EndeeVectorStore":
         """Create EndeeVectorStore from parameters.
-
+
         Args:
             api_token: API token for Endee service
             index_name: Name of the index
             dimension: Vector dimension
             space_type: Distance metric ("cosine", "l2", or "ip")
             batch_size: Batch size for operations
-            hybrid: If True, create/use a hybrid index
-
-            model_name: Model name or alias for sparse embeddings
-
-
-
-
-                - 'distilbert': distilbert-base-uncased (~256 MB)
-                - 'minilm': sentence-transformers/all-MiniLM-L6-v2 (~90 MB)
-                - 'mpnet': sentence-transformers/all-mpnet-base-v2 (~420 MB)
-                - 'roberta': roberta-base (~501 MB)
-                - 'xlm_roberta': xlm-roberta-base (~1.3 GB)
-            precision: Precision setting for index ("low", "medium", "high", or None)
-            key: Encryption key for encrypting metadata (256-bit hex key, 64 hex characters)
-                If provided, metadata will be encrypted using AES-256. Store this key securely.
+            hybrid: If True, create/use a hybrid index. Auto-set to True if sparse_dim > 0.
+            sparse_dim: Sparse dimension for hybrid index. If > 0, hybrid is automatically enabled.
+            model_name: Model name or alias for sparse embeddings. Defaults to 'splade_pp' if not provided.
+                Available: 'splade_pp', 'splade_cocondenser', 'bert_base', 'distilbert', etc.
+            precision: Precision for index. Use one of: "binary", "float16", "float32", "int16d", "int8d". Default "float16".
+            M: Optional HNSW M parameter (bi-directional links per node). If not provided, backend uses default.
+            ef_con: Optional HNSW ef_construction parameter. If not provided, backend uses default.
         """
-        if
-
-
-
-
+        # Auto-enable hybrid if sparse_dim is provided and > 0
+        try:
+            if sparse_dim is not None and sparse_dim > 0:
+                hybrid = True
+                _logger.info(f"Auto-enabling hybrid mode (sparse_dim={sparse_dim} > 0)")
+            if sparse_dim>0:
+                sparse_dim=30522
+
+            # Initialize index (unified method handles both dense and hybrid)
             endee_index = cls._initialize_endee_index(
-            api_token,
+                api_token,
+                index_name,
+                dimension,
+                space_type,
+                precision,
+                sparse_dim=sparse_dim if hybrid else None,
+                M=M,
+                ef_con=ef_con,
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Get actual index configuration from the backend
+            try:
+                index_info = endee_index.describe()
+                actual_index_name = index_info.get("name", index_name)
+                actual_dimension = index_info.get("dimension", dimension)
+                actual_space_type = index_info.get("space_type", space_type)
+                actual_precision = index_info.get("precision", precision)
+                actual_sparse_dim = index_info.get("sparse_dim", sparse_dim)
+            except Exception as e:
+                _logger.warning(
+                    f"Could not get index info, using provided parameters: {e}"
+                )
+                # Fallback to provided parameters
+                actual_index_name = index_name
+                actual_dimension = dimension
+                actual_space_type = space_type
+                actual_precision = precision
+                actual_sparse_dim = sparse_dim
+
+            # Determine if index is hybrid based on sparse_dim
+            actual_hybrid = actual_sparse_dim is not None and actual_sparse_dim > 0
+
+            return cls(
+                endee_index=endee_index,
+                api_token=api_token,
+                index_name=actual_index_name,
+                dimension=actual_dimension,
+                space_type=actual_space_type,
+                batch_size=batch_size,
+                sparse_dim=actual_sparse_dim,
+                hybrid=actual_hybrid,
+                model_name=model_name,
+                precision=actual_precision,
+                M=M,
+                ef_con=ef_con,
+            )
+        except Exception as e:
+            _logger.error(f"Error creating EndeeVectorStore from params: {e}")
+            raise
 
     @classmethod
     def class_name(cls) -> str:
-
+        try:
+            return "EndeeVectorStore"
+        except Exception as e:
+            _logger.error(f"Error getting class name: {e}")
+            raise
 
     def _compute_sparse_vectors(self, texts: List[str]) -> tuple:
         """Compute sparse vectors for a list of texts."""
-
-
-
-
-
-
+        try:
+            if self._sparse_encoder is None:
+                raise ValueError(
+                    "Sparse encoder not initialized. "
+                    "Please provide model_name when creating the store with hybrid=True."
+                )
+            return self._sparse_encoder(texts)
+        except Exception as e:
+            _logger.error(f"Error computing sparse vectors: {e}")
+            raise
 
     def add(
         self,
         nodes: List[BaseNode],
-        hybrid: Optional[bool] = None,
         **add_kwargs: Any,
     ) -> List[str]:
         """
         Add nodes to index.
 
         Args:
-            nodes: List
-
-
+            nodes: List of nodes with embeddings to add to the index.
+                If index is configured for hybrid search (self.hybrid=True),
+                sparse vectors will be automatically computed from node text.
         """
-
-
-
-
-
-
-
-
-
-        for
-
-
-
-
-
-
-            else:
-                sparse_indices = [[] for _ in texts]
-                sparse_values = [[] for _ in texts]
-
-        for i, node in enumerate(nodes):
-            node_id = node.node_id
-            metadata = node_to_metadata_dict(node)
-
-            # Filter values must be simple key-value pairs
-            filter_data = {}
-            if "file_name" in metadata:
-                filter_data["file_name"] = metadata["file_name"]
-            if "doc_id" in metadata:
-                filter_data["doc_id"] = metadata["doc_id"]
-            if "category" in metadata:
-                filter_data["category"] = metadata["category"]
-            if "difficulty" in metadata:
-                filter_data["difficulty"] = metadata["difficulty"]
-            if "language" in metadata:
-                filter_data["language"] = metadata["language"]
-            if "field" in metadata:
-                filter_data["field"] = metadata["field"]
-            if "type" in metadata:
-                filter_data["type"] = metadata["type"]
-            if "feature" in metadata:
-                filter_data["feature"] = metadata["feature"]
-
-
-            # Build entry based on hybrid mode
+        try:
+            # Use instance hybrid setting
+            use_hybrid = self.hybrid
+
+            # Endee Index.upsert rejects duplicate IDs in a batch; dedupe by node_id (keep last)
+            seen: Dict[str, int] = {}
+            for idx, node in enumerate(nodes):
+                seen[node.node_id] = idx
+            deduped_indices = sorted(seen.values())
+            nodes = [nodes[i] for i in deduped_indices]
+
+            ids = []
+            entries = []
+            texts = []
+
+            # Collect texts for sparse encoding if hybrid mode
             if use_hybrid:
-
-
-
-                    "sparse_vector": {
-                        "indices": sparse_indices[i],
-                        "values": sparse_values[i]
-                    },
-                    "meta": metadata,
-                }
-            else:
-                entry = {
-                    "id": node_id,
-                    "vector": node.get_embedding(),
-                    "meta": metadata,
-                    "filter": filter_data
-                }
+                for node in nodes:
+                    text = node.get_content()
+                    texts.append(text)
 
-
-
-
-
-
-
-
-
-
-
+                # Compute sparse vectors in batch
+                if self._sparse_encoder is not None and texts:
+                    sparse_indices, sparse_values = self._compute_sparse_vectors(texts)
+                else:
+                    sparse_indices = [[] for _ in texts]
+                    sparse_values = [[] for _ in texts]
+
+            for i, node in enumerate(nodes):
+                node_id = node.node_id
+                metadata = node_to_metadata_dict(node)
+
+                # Filter values must be simple key-value pairs
+                filter_data = {}
+                ref_id = getattr(node, "ref_doc_id", None) or metadata.get("ref_doc_id")
+                if ref_id is not None:
+                    filter_data["ref_doc_id"] = ref_id
+                if "file_name" in metadata:
+                    filter_data["file_name"] = metadata["file_name"]
+                if "doc_id" in metadata:
+                    filter_data["doc_id"] = metadata["doc_id"]
+                if "category" in metadata:
+                    filter_data["category"] = metadata["category"]
+                if "difficulty" in metadata:
+                    filter_data["difficulty"] = metadata["difficulty"]
+                if "language" in metadata:
+                    filter_data["language"] = metadata["language"]
+                if "field" in metadata:
+                    filter_data["field"] = metadata["field"]
+                if "type" in metadata:
+                    filter_data["type"] = metadata["type"]
+                if "feature" in metadata:
+                    filter_data["feature"] = metadata["feature"]
+
+                # Build entry for endee Index.upsert
+                if use_hybrid:
+                    entry = {
+                        "id": node_id,
+                        "vector": node.get_embedding(),
+                        "sparse_indices": sparse_indices[i],
+                        "sparse_values": sparse_values[i],
+                        "meta": metadata,
+                        "filter": filter_data,
+                    }
+                else:
+                    entry = {
+                        "id": node_id,
+                        "vector": node.get_embedding(),
+                        "meta": metadata,
+                        "filter": filter_data,
+                    }
+
+                ids.append(node_id)
+                entries.append(entry)
+
+            # Batch insert; endee Index.upsert allows max MAX_VECTORS_PER_BATCH per batch
+            batch_size = min(self.batch_size, MAX_VECTORS_PER_BATCH)
+            for i in range(0, len(entries), batch_size):
+                batch = entries[i : i + batch_size]
+                self._endee_index.upsert(batch)
+
+            return ids
+        except Exception as e:
+            _logger.error(f"Error adding nodes to index: {e}")
+            raise
 
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using
-
-        Args:
-            ref_doc_id (str): The id of the document to delete.
+        Delete nodes by ref_doc_id using endee Index.delete_with_filter.
+        Only deletes vectors that were stored with ref_doc_id in their filter (see add()).
         """
         try:
-
+            # Filter format consistent with query: list of {field: {$op: value}}
+            filter_dict = [{"ref_doc_id": {"$eq": ref_doc_id}}]
+            self._endee_index.delete_with_filter(filter_dict)
         except Exception as e:
-            _logger.error(f"Error deleting
+            _logger.error(f"Error deleting by ref_doc_id {ref_doc_id!r}: {e}")
+            raise
 
     @property
     def client(self) -> Any:
         """Return Endee index client."""
-
-
+        try:
+            return self._endee_index
+        except Exception as e:
+            _logger.error(f"Error getting client: {e}")
+            raise
+
+    def describe(self) -> Dict[str, Any]:
+        """Get index metadata (endee Index.describe())."""
+        try:
+            return self._endee_index.describe()
+        except Exception as e:
+            _logger.error(f"Error describing index: {e}")
+            return {}
+
+    def fetch(self, ids: List[str]) -> List[Dict[str, Any]]:
+        """Fetch vectors by IDs (uses endee Index.get_vector per id)."""
+        out: List[Dict[str, Any]] = []
+        for id_ in ids:
+            try:
+                out.append(self._endee_index.get_vector(id_))
+            except Exception as e:
+                _logger.error(f"Error fetching vector id {id_}: {e}")
+        return out
 
     def query(
         self,
         query: VectorStoreQuery,
-
-        sparse_query_text: Optional[str] = None,
-        sparse_top_k: Optional[int] = None,
-        dense_top_k: Optional[int] = None,
-        rrf_k: int = 60,
+        ef: int = DEFAULT_EF_SEARCH,
         **kwargs: Any,
     ) -> VectorStoreQueryResult:
         """
         Query index for top k most similar nodes.
 
         Args:
-            query: VectorStoreQuery object containing query parameters
-
-
-
-
-
-
-
-                Defaults to query.similarity_top_k if not specified.
-            rrf_k: Reciprocal Rank Fusion parameter (default: 60).
+            query: VectorStoreQuery object containing query parameters:
+                - query_embedding: Dense vector for search
+                - query_str: Text query for sparse search (used if index is hybrid)
+                - similarity_top_k: Number of results to return
+                - filters: Optional metadata filters
+                - alpha: Optional weighting for hybrid search (0=sparse, 1=dense)
+            ef: HNSW ef_search parameter (default 128, max 1024).
+                Controls search quality vs speed tradeoff.
         """
-        # Use
-
-
-        if not hasattr(self._endee_index, 'dimension'):
-            # Get dimension from index if available, otherwise try to infer from query
-            try:
-                dimension = self._endee_index.describe()["dimension"]
-            except:
-                if query.query_embedding is not None:
-                    dimension = len(query.query_embedding)
-                else:
-                    raise ValueError("Could not determine vector dimension")
-        else:
-            dimension = self._endee_index.dimension
-
-        query_embedding = [0.0] * dimension  # Default empty vector
-        filters = {}
-
-        # Apply any metadata filters if provided
-        if query.filters is not None:
-            for filter_item in query.filters.filters:
-                # Case 1: MetadataFilter object
-                if hasattr(filter_item, "key") and hasattr(filter_item, "value") and hasattr(filter_item, "operator"):
-                    op_symbol = reverse_operator_map.get(filter_item.operator)
-                    if not op_symbol:
-                        raise ValueError(f"Unsupported filter operator: {filter_item.operator}")
-
-                    if filter_item.key not in filters:
-                        filters[filter_item.key] = {}
-
-                    filters[filter_item.key][op_symbol] = filter_item.value
-
-                # Case 2: Raw dict, e.g. {"category": {"$eq": "programming"}}
-                elif isinstance(filter_item, dict):
-                    for key, op_dict in filter_item.items():
-                        if isinstance(op_dict, dict):
-                            for op, val in op_dict.items():
-                                if key not in filters:
-                                    filters[key] = {}
-                                filters[key][op] = val
-                        else:
-                            raise ValueError(f"Unsupported filter format: {filter_item}")
-
-        _logger.info(f"Final structured filters: {filters}")
-
-        # Use the query embedding if provided
-        if query.query_embedding is not None:
-            query_embedding = cast(List[float], query.query_embedding)
-            if query.alpha is not None and query.mode == VectorStoreQueryMode.HYBRID:
-                # Apply alpha scaling in hybrid mode
-                query_embedding = [v * query.alpha for v in query_embedding]
-
-        # Compute sparse query vector if hybrid mode
-        sparse_vector = {"indices": [], "values": []}
-
-        if use_hybrid:
-            query_text = sparse_query_text or getattr(query, 'query_str', None)
-            if query_text and self._sparse_encoder is not None:
-                sparse_indices_batch, sparse_values_batch = self._compute_sparse_vectors([query_text])
-                sparse_vector = {
-                    "indices": sparse_indices_batch[0],
-                    "values": sparse_values_batch[0]
-                }
+        # Use index configuration to determine hybrid mode
+        try:
+            use_hybrid = self.hybrid
 
-
-
-
+            # Log the mode being used
+            _logger.info(
+                f"Using {'hybrid' if use_hybrid else 'dense-only'} search (index configured with hybrid={self.hybrid})"
+            )
 
-
-
-
-
-
-
-
-
-
-
-                rrf_k=rrf_k,
-            )
+            if not hasattr(self._endee_index, "dimension"):
+                # Get dimension from index if available, otherwise try to infer from query
+                try:
+                    dimension = self._endee_index.describe()["dimension"]
+                except Exception as e:
+                    _logger.warning(f"Could not get dimension from index: {e}")
+                    if query.query_embedding is not None:
+                        dimension = len(query.query_embedding)
+                    else:
+                        raise ValueError("Could not determine vector dimension")
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                dimension = self._endee_index.dimension
+
+            query_embedding = [0.0] * dimension  # Default empty vector
+            filters = {}
+            # Apply any metadata filters if provided
+            if query.filters is not None:
+                for filter_item in query.filters.filters:
+                    # Case 1: MetadataFilter object
+                    if (
+                        hasattr(filter_item, "key")
+                        and hasattr(filter_item, "value")
+                        and hasattr(filter_item, "operator")
+                    ):
+                        if filter_item.operator not in SUPPORTED_FILTER_OPERATORS:
+                            raise ValueError(
+                                f"Unsupported filter operator: {filter_item.operator}. "
+                                "Supported filter operations: EQ ($eq), IN ($in)."
+                            )
+                        op_symbol = REVERSE_OPERATOR_MAP[filter_item.operator]
+                        if filter_item.key not in filters:
+                            filters[filter_item.key] = {}
+                        filters[filter_item.key][op_symbol] = filter_item.value
+
+                    # Case 2: Raw dict, e.g. {"category": {"$eq": "programming"}}
+                    elif isinstance(filter_item, dict):
+                        for key, op_dict in filter_item.items():
+                            if isinstance(op_dict, dict):
+                                for op, val in op_dict.items():
+                                    if key not in filters:
+                                        filters[key] = {}
+                                    filters[key][op] = val
+                            else:
+                                raise ValueError(f"Unsupported filter format: {filter_item}")
+
+            _logger.info(f"Final structured filters: {filters}")
+
+            # Endee API expects filter as array: [{"field": {"$op": value}}, ...]
+            filter_for_api: Optional[List[Dict[str, Any]]] = None
+            if filters:
+                filter_for_api = [{field: ops} for field, ops in filters.items()]
+                _logger.info(f"Filter sent to backend API: {filter_for_api}")
+
+            # Use the query embedding if provided
+            if query.query_embedding is not None:
+                query_embedding = cast(List[float], query.query_embedding)
+                if query.alpha is not None and use_hybrid:
+                    # Apply alpha scaling in hybrid mode
+                    query_embedding = [v * query.alpha for v in query_embedding]
+
+            # Sparse query components for hybrid (endee Index.query uses sparse_indices, sparse_values)
+            sparse_indices_q: Optional[List[int]] = None
+            sparse_values_q: Optional[List[float]] = None
+            if use_hybrid:
+                # Get query text from query.query_str
+                query_text = getattr(query, "query_str", None)
+                if query_text and self._sparse_encoder is not None:
+                    _logger.info(
+                        f"Processing sparse vectors for hybrid search with query_str: '{query_text[:100]}...'"
+                    )
+                    si, sv = self._compute_sparse_vectors([query_text])
+                    sparse_indices_q = si[0]
+                    sparse_values_q = [float(v) for v in sv[0]]
+                    _logger.info(f"Generated {len(sparse_indices_q)} sparse features")
+                elif query_text:
+                    _logger.warning(
+                        "Hybrid mode enabled but no sparse encoder available"
+                    )
+                else:
+                    _logger.warning(
+                        "Hybrid mode enabled but no query_str provided in VectorStoreQuery"
+                    )
             else:
-
-            metadata=metadata,
-            text_key=self.text_key,
-        )
-
-        # Create TextNode with the extracted metadata
-        # Step 1: Get the JSON string from "_node_content"
-        _node_content_str = metadata.get("_node_content", "{}")
+                _logger.info("Using dense-only search (not hybrid mode)")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Cap to endee limits (MAX_TOP_K_ALLOWED=512, MAX_EF_SEARCH_ALLOWED=1024)
+            requested_top_k = (
+                query.similarity_top_k if query.similarity_top_k is not None else 10
+            )
+            top_k = min(requested_top_k, MAX_TOP_K_ALLOWED)
+            ef_capped = min(ef, MAX_EF_SEARCH_ALLOWED)
+
+            # Build query kwargs - only include optional parameters if they have values
+            query_kwargs = {
+                "vector": query_embedding,
+                "top_k": top_k,
+                "ef": ef_capped,
+                "include_vectors": True,
+            }
+
+            # Only add filter if provided
+            if filter_for_api is not None:
+                query_kwargs["filter"] = filter_for_api
+
+            # Only add sparse vectors if provided (for hybrid search)
+            if sparse_indices_q is not None:
+                query_kwargs["sparse_indices"] = sparse_indices_q
+            if sparse_values_q is not None:
+                query_kwargs["sparse_values"] = sparse_values_q
+            # Use endee Index.query
+            try:
+                results = self._endee_index.query(**query_kwargs)
+            except Exception as e:
+                _logger.error(f"Error querying Endee: {e}")
+                return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+            # Process results
+            nodes = []
+            similarities = []
+            ids = []
+
+            for result in results:
+                node_id = result["id"]
+                score = result.get("similarity", result.get("score", 0.0))
+                metadata = result.get("meta", {})
+
+                # Create node from metadata
+                if self.flat_metadata:
+                    node = metadata_dict_to_node(
+                        metadata=metadata,
+                        text=metadata.pop(self.text_key, None),
+                        id_=node_id,
+                    )
+                else:
+                    metadata_dict, node_info, relationships = (
+                        legacy_metadata_dict_to_node(
+                            metadata=metadata,
+                            text_key=self.text_key,
+                        )
+                    )
+
+                    # Create TextNode with the extracted metadata
+                    # Step 1: Get the JSON string from "_node_content"
+                    _node_content_str = metadata.get("_node_content", "{}")
+
+                    # Step 2: Convert JSON string to Python dict
+                    try:
+                        node_content = json.loads(_node_content_str)
+                    except json.JSONDecodeError:
+                        node_content = {}
+
+                    # Step 3: Get the text
+                    text = node_content.get(self.text_key, "")
+                    node = TextNode(
+                        text=text,
+                        metadata=metadata_dict,
+                        relationships=relationships,
+                        node_id=node_id,
+                    )
+
+                    # Add any node_info properties to the node
+                    for key, val in node_info.items():
+                        if hasattr(node, key):
+                            setattr(node, key, val)
+
+                # If embedding was returned in the results, add it to the node
+                if "vector" in result:
+                    node.embedding = result["vector"]
+
+                nodes.append(node)
+                similarities.append(score)
+                ids.append(node_id)
+
+            return VectorStoreQueryResult(
+                nodes=nodes, similarities=similarities, ids=ids
+            )
+        except Exception as e:
+            _logger.error(f"Error querying index: {e}")
+            raise