endee-llamaindex 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,740 @@
1
+ import logging
2
+ from collections import Counter
3
+ from typing import Any, Callable, Dict, List, Optional, cast
4
+ import json
5
+
6
+ from llama_index.core.bridge.pydantic import PrivateAttr
7
+ from llama_index.core.schema import BaseNode, TextNode
8
+ from llama_index.core.vector_stores.types import (
9
+ BasePydanticVectorStore,
10
+ MetadataFilters,
11
+ VectorStoreQuery,
12
+ VectorStoreQueryMode,
13
+ VectorStoreQueryResult,
14
+ )
15
+ from llama_index.core.vector_stores.utils import (
16
+ DEFAULT_TEXT_KEY,
17
+ legacy_metadata_dict_to_node,
18
+ metadata_dict_to_node,
19
+ node_to_metadata_dict,
20
+ )
21
+ from llama_index.core.vector_stores.types import MetadataFilter, FilterOperator
22
+
23
# Module-level logger for this integration.
_logger = logging.getLogger(__name__)

# Default number of entries sent per upsert batch.
DEFAULT_BATCH_SIZE = 100

# Supported sparse embedding models: short alias -> Hugging Face model id.
# Unrecognized names are passed through unchanged by the encoder initializers.
SUPPORTED_SPARSE_MODELS = {
    "splade_pp": "prithivida/Splade_PP_en_v1",
    "splade_cocondenser": "naver/splade-cocondenser-ensembledistil",
    "bert_base": "bert-base-uncased",
    "distilbert": "distilbert-base-uncased",
    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
    "mpnet": "sentence-transformers/all-mpnet-base-v2",
    "roberta": "roberta-base",
    "xlm_roberta": "xlm-roberta-base",
}

# Maps LlamaIndex FilterOperator values to the Mongo-style "$" operator
# strings used when building Endee query filters.
reverse_operator_map = {
    FilterOperator.EQ: "$eq",
    FilterOperator.NE: "$ne",
    FilterOperator.GT: "$gt",
    FilterOperator.GTE: "$gte",
    FilterOperator.LT: "$lt",
    FilterOperator.LTE: "$lte",
    FilterOperator.IN: "$in",
    FilterOperator.NIN: "$nin",
}
49
+
50
+
51
+ def _import_endee() -> Any:
52
+ """Import endee module."""
53
+ try:
54
+ import endee
55
+ from endee.endee import Endee
56
+ except ImportError as e:
57
+ raise ImportError(
58
+ "Could not import endee python package. "
59
+ "Please install it with `pip install endee`."
60
+ ) from e
61
+ return endee
62
+
63
+
64
def build_dict(input_batch: List[List[int]]) -> List[Dict[str, Any]]:
    """
    Build a list of sparse dictionaries from a batch of input_ids.

    Each token-id list is converted to ``{"indices": [...], "values": [...]}``
    where values are the (float) occurrence counts of each distinct token,
    in first-seen order.
    """
    result = []
    for token_ids in input_batch:
        counts = Counter(token_ids)
        result.append(
            {
                "indices": list(counts.keys()),
                "values": [float(c) for c in counts.values()],
            }
        )
    return result
78
+
79
+
80
+ def generate_sparse_vectors(
81
+ context_batch: List[str], tokenizer: Callable
82
+ ) -> List[Dict[str, Any]]:
83
+ """Generate sparse vectors from a batch of contexts."""
84
+ inputs = tokenizer(context_batch)["input_ids"]
85
+ return build_dict(inputs)
86
+
87
+
88
def _initialize_sparse_encoder_fastembed(
    model_name: str,
    batch_size: int = 256,
    cache_dir: Optional[str] = None,
    threads: Optional[int] = None,
) -> Callable:
    """
    Initialize a sparse encoder using FastEmbed (recommended for SPLADE models).

    Returns a function mapping a list of texts to ``(indices, values)`` lists.
    """
    try:
        from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
    except ImportError as e:
        raise ImportError(
            "Could not import FastEmbed. "
            "Please install it with `pip install fastembed` or "
            "`pip install fastembed-gpu` for GPU support."
        ) from e

    # Resolve a short alias to its full model id; unknown names pass through.
    resolved_model_name = SUPPORTED_SPARSE_MODELS.get(model_name, model_name)

    # Prefer the CUDA execution provider; fall back to CPU on any failure.
    try:
        model = SparseTextEmbedding(
            resolved_model_name,
            cache_dir=cache_dir,
            threads=threads,
            providers=["CUDAExecutionProvider"],
        )
        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on GPU")
    except Exception:
        model = SparseTextEmbedding(
            resolved_model_name,
            cache_dir=cache_dir,
            threads=threads
        )
        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on CPU")

    def compute_vectors(texts: List[str]) -> tuple:
        """Compute sparse vectors (indices, values) for a list of texts."""
        indices, values = [], []
        for emb in model.embed(texts, batch_size=batch_size):
            indices.append(emb.indices.tolist())
            values.append(emb.values.tolist())
        return indices, values

    return compute_vectors
135
+
136
+
137
def _initialize_sparse_encoder_transformers(model_name: str) -> Callable:
    """
    Initialize a sparse encoder using the Transformers library.

    Returns a function mapping a list of texts to ``(indices, values)``
    sparse vectors derived from masked-LM logits.
    """
    try:
        import torch
        from transformers import AutoModelForMaskedLM, AutoTokenizer
    except ImportError as e:
        raise ImportError(
            "Could not import transformers library. "
            'Please install transformers with `pip install "transformers[torch]"`'
        ) from e

    # Resolve a short alias to its full model id; unknown names pass through.
    resolved_model_name = SUPPORTED_SPARSE_MODELS.get(model_name, model_name)

    tokenizer = AutoTokenizer.from_pretrained(resolved_model_name)
    model = AutoModelForMaskedLM.from_pretrained(resolved_model_name)

    if torch.cuda.is_available():
        model = model.to("cuda")
        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on GPU")
    else:
        _logger.info(f"Initialized sparse encoder '{resolved_model_name}' on CPU")

    def compute_vectors(texts: List[str]) -> tuple:
        """Compute sparse vectors from logits."""
        tokens = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )
        if torch.cuda.is_available():
            tokens = tokens.to("cuda")

        with torch.no_grad():
            output = model(**tokens)
            logits, attention_mask = output.logits, tokens.attention_mask
            # SPLADE-style aggregation: log-saturated ReLU of the logits,
            # masked by attention, then max-pooled over the sequence axis.
            relu_log = torch.log(1 + torch.relu(logits))
            weighted_log = relu_log * attention_mask.unsqueeze(-1)
            tvecs, _ = torch.max(weighted_log, dim=1)

        indices, values = [], []
        for vec in tvecs:
            nz = vec.nonzero(as_tuple=True)[0].tolist()
            indices.append(nz)
            values.append(vec[nz].tolist())
        return indices, values

    return compute_vectors
191
+
192
+
193
def get_sparse_encoder(
    model_name: Optional[str] = None,
    use_fastembed: bool = True,
    batch_size: int = 256,
    cache_dir: Optional[str] = None,
    threads: Optional[int] = None,
) -> Optional[Callable]:
    """
    Get a sparse encoder function for the specified model.

    Returns ``None`` when no model name is given; otherwise dispatches to the
    FastEmbed or Transformers initializer.
    """
    if model_name is None:
        return None
    if not use_fastembed:
        return _initialize_sparse_encoder_transformers(model_name=model_name)
    return _initialize_sparse_encoder_fastembed(
        model_name=model_name,
        batch_size=batch_size,
        cache_dir=cache_dir,
        threads=threads,
    )
215
+
216
+
217
class EndeeHybridVectorStore(BasePydanticVectorStore):
    """
    Endee Hybrid Vector Store for combined dense and sparse vector search.

    This class provides hybrid search capabilities using both dense embeddings
    and sparse vectors (e.g., SPLADE, BM25-style) for improved retrieval.
    """

    stores_text: bool = True
    flat_metadata: bool = False

    api_token: Optional[str]
    index_name: Optional[str]
    space_type: Optional[str]
    dimension: Optional[int]
    vocab_size: int
    insert_kwargs: Optional[Dict]
    text_key: str
    batch_size: int
    remove_text_from_metadata: bool
    model_name: Optional[str]
    use_fastembed: bool
    alpha: float  # Weight for dense vs sparse (0=sparse only, 1=dense only)

    _endee_index: Any = PrivateAttr()
    _sparse_encoder: Optional[Callable] = PrivateAttr(default=None)

    def __init__(
        self,
        endee_index: Optional[Any] = None,
        api_token: Optional[str] = None,
        index_name: Optional[str] = None,
        space_type: Optional[str] = "cosine",
        dimension: Optional[int] = None,
        vocab_size: int = 30522,  # Default BERT vocab size
        insert_kwargs: Optional[Dict] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        batch_size: int = DEFAULT_BATCH_SIZE,
        remove_text_from_metadata: bool = False,
        model_name: Optional[str] = "splade_pp",
        use_fastembed: bool = True,
        alpha: float = 0.5,
        **kwargs: Any,
    ) -> None:
        """Initialize the store, connecting to (or creating) a hybrid index.

        Args:
            endee_index: An existing Endee hybrid index object; when given,
                no new connection is made.
            api_token: API token for the Endee service.
            index_name: Name of the hybrid index.
            space_type: Distance metric for the dense side.
            dimension: Dense vector dimension (required only when creating).
            vocab_size: Vocabulary size for the sparse side.
            insert_kwargs: Extra keyword arguments for upserts.
            text_key: Metadata key holding the node text.
            batch_size: Batch size used for upserts and sparse encoding.
            remove_text_from_metadata: Whether to strip text from metadata.
            model_name: Sparse model alias/name; None disables sparse encoding.
            use_fastembed: Use FastEmbed (vs Transformers) for sparse vectors.
            alpha: Default hybrid weight (0=sparse only, 1=dense only).
        """
        insert_kwargs = insert_kwargs or {}

        super().__init__(
            index_name=index_name,
            api_token=api_token,
            space_type=space_type,
            dimension=dimension,
            vocab_size=vocab_size,
            insert_kwargs=insert_kwargs,
            text_key=text_key,
            batch_size=batch_size,
            remove_text_from_metadata=remove_text_from_metadata,
            model_name=model_name,
            use_fastembed=use_fastembed,
            alpha=alpha,
        )

        # Initialize hybrid index (reuse the provided one when available).
        if endee_index is not None:
            self._endee_index = endee_index
        else:
            self._endee_index = self._initialize_hybrid_index(
                api_token, index_name, dimension, space_type, vocab_size
            )

        # Initialize sparse encoder; None means dense-only operation.
        if model_name:
            _logger.info(f"Initializing sparse encoder with model: {model_name}")
            self._sparse_encoder = get_sparse_encoder(
                model_name=model_name,
                use_fastembed=use_fastembed,
                batch_size=batch_size,
            )
        else:
            self._sparse_encoder = None

    @classmethod
    def _initialize_hybrid_index(
        cls,
        api_token: Optional[str],
        index_name: Optional[str],
        dimension: Optional[int] = None,
        space_type: Optional[str] = "cosine",
        vocab_size: Optional[int] = None,
    ) -> Any:
        """Initialize Endee hybrid index.

        Tries to fetch an existing index by name; on failure, creates one
        (which requires both ``dimension`` and ``vocab_size``).
        """
        _import_endee()
        from endee.endee import Endee

        nd = Endee(token=api_token)

        try:
            index = nd.get_hybrid_index(name=index_name)
            _logger.info(f"Retrieved existing hybrid index: {index_name}")
            return index
        except Exception as e:
            if dimension is None:
                raise ValueError(
                    "Must provide dimension when creating a new hybrid index"
                ) from e
            if vocab_size is None:
                raise ValueError(
                    "Must provide vocab_size when creating a new hybrid index"
                ) from e

            _logger.info(f"Creating new hybrid index: {index_name}")
            nd.create_hybrid_index(
                name=index_name,
                dimension=dimension,
                space_type=space_type,
                vocab_size=vocab_size,
            )
            return nd.get_hybrid_index(name=index_name)

    @classmethod
    def from_params(
        cls,
        api_token: Optional[str] = None,
        index_name: Optional[str] = None,
        dimension: Optional[int] = None,
        space_type: str = "cosine",
        vocab_size: int = 30522,
        batch_size: int = DEFAULT_BATCH_SIZE,
        model_name: Optional[str] = "splade_pp",
        use_fastembed: bool = True,
        alpha: float = 0.5,
    ) -> "EndeeHybridVectorStore":
        """
        Create EndeeHybridVectorStore from parameters.

        Args:
            api_token: API token for Endee service
            index_name: Name of the hybrid index
            dimension: Vector dimension for dense embeddings
            space_type: Distance metric ("cosine", "l2", or "ip")
            vocab_size: Vocabulary size for sparse vectors
            batch_size: Batch size for operations
            model_name: Model name or alias for sparse embeddings
                Supported models:
                - 'splade_pp': prithivida/Splade_PP_en_v1
                - 'splade_cocondenser': naver/splade-cocondenser-ensembledistil
                - 'bert_base': bert-base-uncased
                - 'distilbert': distilbert-base-uncased
                - 'minilm': sentence-transformers/all-MiniLM-L6-v2
                - 'mpnet': sentence-transformers/all-mpnet-base-v2
                - 'roberta': roberta-base
                - 'xlm_roberta': xlm-roberta-base
            use_fastembed: Use FastEmbed for sparse encoding (recommended)
            alpha: Weight for hybrid search (0=sparse only, 1=dense only, 0.5=balanced)
        """
        endee_index = cls._initialize_hybrid_index(
            api_token, index_name, dimension, space_type, vocab_size
        )

        return cls(
            endee_index=endee_index,
            api_token=api_token,
            index_name=index_name,
            dimension=dimension,
            space_type=space_type,
            vocab_size=vocab_size,
            batch_size=batch_size,
            model_name=model_name,
            use_fastembed=use_fastembed,
            alpha=alpha,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the registered class name for serialization."""
        return "EndeeHybridVectorStore"

    def _compute_sparse_vectors(self, texts: List[str]) -> tuple:
        """Compute sparse vectors for a list of texts.

        Raises:
            ValueError: if no sparse encoder was configured.
        """
        if self._sparse_encoder is None:
            raise ValueError(
                "Sparse encoder not initialized. "
                "Please provide model_name when creating the store."
            )
        return self._sparse_encoder(texts)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to hybrid index with both dense and sparse vectors.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            The list of node ids that were upserted.
        """
        ids = []
        entries = []
        texts = []

        # Collect all texts up front so sparse encoding runs as one batch.
        for node in nodes:
            text = node.get_content()
            texts.append(text)

        # Compute sparse vectors in batch; empty vectors when no encoder.
        if self._sparse_encoder is not None and texts:
            sparse_indices, sparse_values = self._compute_sparse_vectors(texts)
        else:
            sparse_indices = [[] for _ in texts]
            sparse_values = [[] for _ in texts]

        for i, node in enumerate(nodes):
            node_id = node.node_id
            metadata = node_to_metadata_dict(node)

            # Copy a fixed whitelist of metadata keys into the index's
            # filterable fields.
            filter_data = {}
            for key in ["file_name", "doc_id", "category", "difficulty",
                        "language", "field", "type", "feature"]:
                if key in metadata:
                    filter_data[key] = metadata[key]

            entry = {
                "id": node_id,
                "vector": node.get_embedding(),
                "sparse_indices": sparse_indices[i],
                "sparse_values": sparse_values[i],
                "meta": metadata,
                "filter": filter_data
            }

            ids.append(node_id)
            entries.append(entry)

        # Batch upsert to keep request sizes bounded.
        batch_size = self.batch_size
        for i in range(0, len(entries), batch_size):
            batch = entries[i : i + batch_size]
            self._endee_index.upsert(batch)

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using ref_doc_id.

        Args:
            ref_doc_id (str): The id of the document to delete.
        """
        try:
            self._endee_index.delete_with_filter({"doc_id": ref_doc_id})
        except Exception as e:
            _logger.error(f"Error deleting vectors for doc_id {ref_doc_id}: {e}")

    def delete_by_ids(self, ids: List[str], **delete_kwargs: Any) -> None:
        """
        Delete nodes by their IDs.

        Args:
            ids: List of node IDs to delete.
        """
        try:
            self._endee_index.delete(ids)
        except Exception as e:
            _logger.error(f"Error deleting vectors by IDs: {e}")

    def delete_with_filter(self, filter_dict: Dict[str, Any], **delete_kwargs: Any) -> None:
        """
        Delete nodes matching a filter.

        Args:
            filter_dict: Filter dictionary for deletion.
        """
        try:
            self._endee_index.delete_with_filter(filter_dict)
        except Exception as e:
            _logger.error(f"Error deleting vectors with filter: {e}")

    @property
    def client(self) -> Any:
        """Return Endee hybrid index client."""
        return self._endee_index

    def query(
        self,
        query: VectorStoreQuery,
        sparse_query_text: Optional[str] = None,
        alpha: Optional[float] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """
        Query hybrid index for top k most similar nodes.

        Args:
            query: VectorStoreQuery object containing query parameters
            sparse_query_text: Optional text to compute sparse vector for query.
                If not provided, uses query.query_str if available.
            alpha: Optional weight override for this query (0=sparse only, 1=dense only)
        """
        # Determine the dense dimension, preferring the index's own metadata
        # and falling back to the query embedding's length.
        try:
            dimension = self._endee_index.describe()["dimension"]
        except Exception:  # narrowed from bare `except:` to avoid swallowing SystemExit/KeyboardInterrupt
            if query.query_embedding is not None:
                dimension = len(query.query_embedding)
            else:
                raise ValueError("Could not determine vector dimension")

        query_embedding = [0.0] * dimension
        filters = {}
        use_alpha = alpha if alpha is not None else self.alpha

        # Build structured filters from either MetadataFilter objects or
        # raw operator dicts.
        if query.filters is not None:
            for filter_item in query.filters.filters:
                if hasattr(filter_item, "key") and hasattr(filter_item, "value") and hasattr(filter_item, "operator"):
                    op_symbol = reverse_operator_map.get(filter_item.operator)
                    if not op_symbol:
                        raise ValueError(f"Unsupported filter operator: {filter_item.operator}")

                    if filter_item.key not in filters:
                        filters[filter_item.key] = {}
                    filters[filter_item.key][op_symbol] = filter_item.value

                elif isinstance(filter_item, dict):
                    for key, op_dict in filter_item.items():
                        if isinstance(op_dict, dict):
                            for op, val in op_dict.items():
                                if key not in filters:
                                    filters[key] = {}
                                filters[key][op] = val
                else:
                    raise ValueError(f"Unsupported filter format: {filter_item}")

        _logger.info(f"Final structured filters: {filters}")

        # Dense query embedding (zero vector when none was provided).
        if query.query_embedding is not None:
            query_embedding = cast(List[float], query.query_embedding)

        # Sparse query vector from the query text, when an encoder exists.
        sparse_indices = []
        sparse_values = []

        query_text = sparse_query_text or getattr(query, 'query_str', None)
        if query_text and self._sparse_encoder is not None:
            sparse_indices_batch, sparse_values_batch = self._compute_sparse_vectors([query_text])
            sparse_indices = sparse_indices_batch[0]
            sparse_values = sparse_values_batch[0]

        # Execute hybrid query; failures degrade to an empty result.
        try:
            results = self._endee_index.query(
                vector=query_embedding,
                sparse_indices=sparse_indices,
                sparse_values=sparse_values,
                top_k=query.similarity_top_k,
                filter=filters if filters else None,
                include_vectors=True,
                alpha=use_alpha,
            )
        except Exception as e:
            _logger.error(f"Error querying Endee hybrid index: {e}")
            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])

        # Convert raw results into TextNodes.
        nodes = []
        similarities = []
        ids = []

        for result in results:
            node_id = result["id"]
            score = result.get("similarity", result.get("score", 0.0))
            metadata = result.get("meta", {})

            if self.flat_metadata:
                node = metadata_dict_to_node(
                    metadata=metadata,
                    text=metadata.pop(self.text_key, None),
                    id_=node_id,
                )
            else:
                metadata_dict, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata=metadata,
                    text_key=self.text_key,
                )

                # Node text lives in the serialized "_node_content" blob.
                _node_content_str = metadata.get("_node_content", "{}")
                try:
                    node_content = json.loads(_node_content_str)
                except json.JSONDecodeError:
                    node_content = {}

                text = node_content.get(self.text_key, "")
                node = TextNode(
                    text=text,
                    metadata=metadata_dict,
                    relationships=relationships,
                    node_id=node_id,
                )

                for key, val in node_info.items():
                    if hasattr(node, key):
                        setattr(node, key, val)

            if "vector" in result:
                node.embedding = result["vector"]

            nodes.append(node)
            similarities.append(score)
            ids.append(node_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

    def hybrid_query(
        self,
        query_text: str,
        query_embedding: List[float],
        top_k: int = 10,
        alpha: Optional[float] = None,
        filters: Optional[Dict[str, Any]] = None,
    ) -> VectorStoreQueryResult:
        """
        Direct hybrid query method for convenience.

        Args:
            query_text: Text query for sparse vector computation
            query_embedding: Dense embedding vector
            top_k: Number of results to return
            alpha: Weight for hybrid search (0=sparse, 1=dense)
            filters: Optional filter dictionary

        Returns:
            VectorStoreQueryResult with combined results
        """
        use_alpha = alpha if alpha is not None else self.alpha

        # Sparse vector for the query text (empty when no encoder).
        sparse_indices = []
        sparse_values = []
        if self._sparse_encoder is not None:
            sparse_indices_batch, sparse_values_batch = self._compute_sparse_vectors([query_text])
            sparse_indices = sparse_indices_batch[0]
            sparse_values = sparse_values_batch[0]

        try:
            results = self._endee_index.query(
                vector=query_embedding,
                sparse_indices=sparse_indices,
                sparse_values=sparse_values,
                top_k=top_k,
                filter=filters,
                include_vectors=True,
                alpha=use_alpha,
            )
        except Exception as e:
            _logger.error(f"Error in hybrid query: {e}")
            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])

        nodes = []
        similarities = []
        ids = []

        for result in results:
            node_id = result["id"]
            score = result.get("similarity", result.get("score", 0.0))
            metadata = result.get("meta", {})

            metadata_dict, node_info, relationships = legacy_metadata_dict_to_node(
                metadata=metadata,
                text_key=self.text_key,
            )

            # Node text lives in the serialized "_node_content" blob.
            _node_content_str = metadata.get("_node_content", "{}")
            try:
                node_content = json.loads(_node_content_str)
            except json.JSONDecodeError:
                node_content = {}

            text = node_content.get(self.text_key, "")
            node = TextNode(
                text=text,
                metadata=metadata_dict,
                relationships=relationships,
                node_id=node_id,
            )

            for key, val in node_info.items():
                if hasattr(node, key):
                    setattr(node, key, val)

            if "vector" in result:
                node.embedding = result["vector"]

            nodes.append(node)
            similarities.append(score)
            ids.append(node_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

    def describe(self) -> Dict[str, Any]:
        """Get index description/stats; empty dict on failure."""
        try:
            return self._endee_index.describe()
        except Exception as e:
            _logger.error(f"Error describing index: {e}")
            return {}

    def list_ids(self, limit: int = 100) -> List[str]:
        """List IDs in the index; empty list on failure."""
        try:
            return self._endee_index.list_ids(limit=limit)
        except Exception as e:
            _logger.error(f"Error listing IDs: {e}")
            return []

    def fetch(self, ids: List[str]) -> List[Dict[str, Any]]:
        """Fetch vectors by IDs; empty list on failure."""
        try:
            return self._endee_index.fetch(ids)
        except Exception as e:
            _logger.error(f"Error fetching vectors: {e}")
            return []
+