endee-llamaindex 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ endee_llamaindex/__init__.py
@@ -0,0 +1,3 @@
+from endee_llamaindex.base import EndeeVectorStore
+
+__all__ = ["EndeeVectorStore"]
--- /dev/null
+++ endee_llamaindex/base.py
@@ -0,0 +1,416 @@
+import logging
+from collections import Counter
+from functools import partial
+import json
+from typing import Any, Callable, Dict, List, Optional, cast
+
+from llama_index.core.bridge.pydantic import PrivateAttr
+from llama_index.core.schema import BaseNode, MetadataMode, TextNode
+from llama_index.core.vector_stores.types import (
+    BasePydanticVectorStore,
+    MetadataFilters,
+    VectorStoreQuery,
+    VectorStoreQueryMode,
+    VectorStoreQueryResult,
+)
+from llama_index.core.vector_stores.utils import (
+    DEFAULT_TEXT_KEY,
+    legacy_metadata_dict_to_node,
+    metadata_dict_to_node,
+    node_to_metadata_dict,
+)
+
+from datetime import datetime
+
+def _import_endee() -> Any:
+    """
+    Try to import endee module. If it's not already installed, instruct user how to install.
+    """
+    try:
+        import endee
+        from endee.endee_client import Endee
+    except ImportError as e:
+        raise ImportError(
+            "Could not import endee python package. "
+            "Please install it with `pip install endee`."
+        ) from e
+    return endee
+
+ID_KEY = "id"
+VECTOR_KEY = "values"
+SPARSE_VECTOR_KEY = "sparse_values"
+METADATA_KEY = "metadata"
+
+DEFAULT_BATCH_SIZE = 100
+
+_logger = logging.getLogger(__name__)
+
+from llama_index.core.vector_stores.types import MetadataFilter, FilterOperator
+
+reverse_operator_map = {
+    FilterOperator.EQ: "$eq",
+    FilterOperator.NE: "$ne",
+    FilterOperator.GT: "$gt",
+    FilterOperator.GTE: "$gte",
+    FilterOperator.LT: "$lt",
+    FilterOperator.LTE: "$lte",
+    FilterOperator.IN: "$in",
+    FilterOperator.NIN: "$nin",
+}
+
+
+
+def build_dict(input_batch: List[List[int]]) -> List[Dict[str, Any]]:
+    """
+    Build a list of sparse dictionaries from a batch of input_ids.
+
+    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.
+
+    """
+    # store a batch of sparse embeddings
+    sparse_emb = []
+    # iterate through input batch
+    for token_ids in input_batch:
+        indices = []
+        values = []
+        # convert the input_ids list to a dictionary of key to frequency values
+        d = dict(Counter(token_ids))
+        for idx in d:
+            indices.append(idx)
+            values.append(float(d[idx]))
+        sparse_emb.append({"indices": indices, "values": values})
+    # return sparse_emb list
+    return sparse_emb
+
+
+def generate_sparse_vectors(
+    context_batch: List[str], tokenizer: Callable
+) -> List[Dict[str, Any]]:
+    """
+    Generate sparse vectors from a batch of contexts.
+
+    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.
+
+    """
+    # create batch of input_ids
+    inputs = tokenizer(context_batch)["input_ids"]
+    # create sparse dictionaries
+    return build_dict(inputs)
+
+
+import_err_msg = (
+    "`endee` package not found, please run `pip install endee` to install it."
+)
+
+
+class EndeeVectorStore(BasePydanticVectorStore):
+
+    stores_text: bool = True
+    flat_metadata: bool = False
+
+    api_token: Optional[str]
+    index_name: Optional[str]
+    space_type: Optional[str]
+    dimension: Optional[int]
+    insert_kwargs: Optional[Dict]
+    add_sparse_vector: bool
+    text_key: str
+    batch_size: int
+    remove_text_from_metadata: bool
+
+    _endee_index: Any = PrivateAttr()
+
+    def __init__(
+        self,
+        endee_index: Optional[Any] = None,
+        api_token: Optional[str] = None,
+        index_name: Optional[str] = None,
+        space_type: Optional[str] = "cosine",
+        dimension: Optional[int] = None,
+        insert_kwargs: Optional[Dict] = None,
+        add_sparse_vector: bool = False,
+        text_key: str = DEFAULT_TEXT_KEY,
+        batch_size: int = DEFAULT_BATCH_SIZE,
+        remove_text_from_metadata: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        insert_kwargs = insert_kwargs or {}
+
+        super().__init__(
+            index_name=index_name,
+            api_token=api_token,
+            space_type=space_type,
+            dimension=dimension,
+            insert_kwargs=insert_kwargs,
+            add_sparse_vector=add_sparse_vector,
+            text_key=text_key,
+            batch_size=batch_size,
+            remove_text_from_metadata=remove_text_from_metadata,
+        )
+
+        # Use existing endee_index or initialize a new one
+        self._endee_index = endee_index or self._initialize_endee_index(
+            api_token, index_name, dimension, space_type
+        )
+
+    @classmethod
+    def _initialize_endee_index(
+        cls,
+        api_token: Optional[str],
+        index_name: Optional[str],
+        dimension: Optional[int] = None,
+        space_type: Optional[str] = "cosine",
+    ) -> Any:
+        """Initialize Endee index using the current API."""
+        endee = _import_endee()
+        from endee.endee_client import Endee
+
+        # Initialize Endee client
+        nd = Endee(token=api_token)
+
+        try:
+            # Try to get existing index
+            index = nd.get_index(name=index_name)
+            _logger.info(f"Retrieved existing index: {index_name}")
+            return index
+        except Exception as e:
+            if dimension is None:
+                raise ValueError(
+                    "Must provide dimension when creating a new index"
+                ) from e
+
+            # Create a new index if it doesn't exist
+            _logger.info(f"Creating new index: {index_name}")
+            nd.create_index(
+                name=index_name,
+                dimension=dimension,
+                space_type=space_type,
+            )
+            return nd.get_index(name=index_name)
+
+    @classmethod
+    def from_params(
+        cls,
+        api_token: Optional[str] = None,
+        index_name: Optional[str] = None,
+        dimension: Optional[int] = None,
+        space_type: str = "cosine",
+        batch_size: int = DEFAULT_BATCH_SIZE,
+    ) -> "EndeeVectorStore":
+        """Create EndeeVectorStore from parameters."""
+        endee_index = cls._initialize_endee_index(
+            api_token, index_name, dimension, space_type
+        )
+
+        return cls(
+            endee_index=endee_index,
+            api_token=api_token,
+            index_name=index_name,
+            dimension=dimension,
+            space_type=space_type,
+            batch_size=batch_size,
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "EndeeVectorStore"
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+        **add_kwargs: Any,
+    ) -> List[str]:
+        """
+        Add nodes to index.
+
+        Args:
+            nodes: List[BaseNode]: list of nodes with embeddings
+        """
+        ids = []
+        entries = []
+
+        for node in nodes:
+            node_id = node.node_id
+            metadata = node_to_metadata_dict(node)
+
+            # Filter values must be simple key-value pairs
+            filter_data = {}
+            if "file_name" in metadata:
+                filter_data["file_name"] = metadata["file_name"]
+            if "doc_id" in metadata:
+                filter_data["doc_id"] = metadata["doc_id"]
+            if "category" in metadata:
+                filter_data["category"] = metadata["category"]
+            if "difficulty" in metadata:
+                filter_data["difficulty"] = metadata["difficulty"]
+            if "language" in metadata:
+                filter_data["language"] = metadata["language"]
+            if "field" in metadata:
+                filter_data["field"] = metadata["field"]
+            if "type" in metadata:
+                filter_data["type"] = metadata["type"]
+            if "feature" in metadata:
+                filter_data["feature"] = metadata["feature"]
+
+
+            entry = {
+                "id": node_id,
+                "vector": node.get_embedding(),
+                "meta": metadata,
+                "filter": filter_data
+            }
+
+            ids.append(node_id)
+            entries.append(entry)
+
+        # Batch insert to avoid hitting API limits
+        batch_size = self.batch_size
+        for i in range(0, len(entries), batch_size):
+            batch = entries[i : i + batch_size]
+            self._endee_index.upsert(batch)
+
+        return ids
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """
+        Delete nodes with the given ref_doc_id.
+
+        Args:
+            ref_doc_id (str): The id of the document to delete.
+        """
+        try:
+            self._endee_index.delete_with_filter({"doc_id": ref_doc_id})
+        except Exception as e:
+            _logger.error(f"Error deleting vectors for doc_id {ref_doc_id}: {e}")
+
+    @property
+    def client(self) -> Any:
+        """Return Endee index client."""
+        return self._endee_index
+
+    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
+        """
+        Query index for top k most similar nodes.
+
+        Args:
+            query: VectorStoreQuery object containing query parameters
+        """
+        if not hasattr(self._endee_index, 'dimension'):
+            # Get dimension from index if available, otherwise try to infer from query
+            try:
+                dimension = self._endee_index.describe()["dimension"]
+            except Exception:
+                if query.query_embedding is not None:
+                    dimension = len(query.query_embedding)
+                else:
+                    raise ValueError("Could not determine vector dimension")
+        else:
+            dimension = self._endee_index.dimension
+
+        query_embedding = [0.0] * dimension  # Default empty vector
+        filters = {}
+
+        # Apply any metadata filters if provided
+        if query.filters is not None:
+            for filter_item in query.filters.filters:
+                # Case 1: MetadataFilter object
+                if hasattr(filter_item, "key") and hasattr(filter_item, "value") and hasattr(filter_item, "operator"):
+                    op_symbol = reverse_operator_map.get(filter_item.operator)
+                    if not op_symbol:
+                        raise ValueError(f"Unsupported filter operator: {filter_item.operator}")
+
+                    if filter_item.key not in filters:
+                        filters[filter_item.key] = {}
+
+                    filters[filter_item.key][op_symbol] = filter_item.value
+
+                # Case 2: Raw dict, e.g. {"category": {"$eq": "programming"}}
+                elif isinstance(filter_item, dict):
+                    for key, op_dict in filter_item.items():
+                        if isinstance(op_dict, dict):
+                            for op, val in op_dict.items():
+                                if key not in filters:
+                                    filters[key] = {}
+                                filters[key][op] = val
+                else:
+                    raise ValueError(f"Unsupported filter format: {filter_item}")
+
+        _logger.info(f"Final structured filters: {filters}")
+
+        # Use the query embedding if provided
+        if query.query_embedding is not None:
+            query_embedding = cast(List[float], query.query_embedding)
+            if query.alpha is not None and query.mode == VectorStoreQueryMode.HYBRID:
+                # Apply alpha scaling in hybrid mode
+                query_embedding = [v * query.alpha for v in query_embedding]
+
+        # Execute query
+        try:
+            results = self._endee_index.query(
+                vector=query_embedding,
+                top_k=query.similarity_top_k,
+                filter=filters if filters else None,
+                include_vectors=True
+            )
+        except Exception as e:
+            _logger.error(f"Error querying Endee: {e}")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+        # Process results
+        nodes = []
+        similarities = []
+        ids = []
+
+        for result in results:
+            node_id = result["id"]
+            score = result["similarity"]
+
+            # Get metadata from result
+            metadata = result.get("meta", {})
+
+            # Create node from metadata
+            if self.flat_metadata:
+                node = metadata_dict_to_node(
+                    metadata=metadata,
+                    text=metadata.pop(self.text_key, None),
+                    id_=node_id,
+                )
+            else:
+                metadata_dict, node_info, relationships = legacy_metadata_dict_to_node(
+                    metadata=metadata,
+                    text_key=self.text_key,
+                )
+
+                # Create TextNode with the extracted metadata
+                # Step 1: Get the JSON string from "_node_content"
+                _node_content_str = metadata.get("_node_content", "{}")
+
+                # Step 2: Convert JSON string to Python dict
+                try:
+                    node_content = json.loads(_node_content_str)
+                except json.JSONDecodeError:
+                    node_content = {}
+
+                # Step 3: Get the text
+                text = node_content.get(self.text_key, "")
+                node = TextNode(
+                    text=text,
+                    metadata=metadata_dict,
+                    relationships=relationships,
+                    node_id=node_id,
+                )
+
+                # Add any node_info properties to the node
+                for key, val in node_info.items():
+                    if hasattr(node, key):
+                        setattr(node, key, val)
+
+            # If embedding was returned in the results, add it to the node
+            if "vector" in result:
+                node.embedding = result["vector"]
+
+            nodes.append(node)
+            similarities.append(score)
+            ids.append(node_id)
+
+        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
--- /dev/null
+++ endee_llamaindex-0.1.2.dist-info/METADATA
@@ -0,0 +1,140 @@
+Metadata-Version: 2.4
+Name: endee-llamaindex
+Version: 0.1.2
+Summary: Vector Database for Fast ANN Searches
+Home-page: https://endee.io
+Author: Endee Labs
+Author-email: vineet@endee.io
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+Requires-Dist: llama-index>=0.12.34
+Requires-Dist: endee>=0.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Endee LlamaIndex Integration
+
+This package provides an integration between [Endee](https://endeedb.ai) (a vector database) and [LlamaIndex](https://www.llamaindex.ai/), allowing you to use Endee as a vector store backend for LlamaIndex.
+
+## Features
+
+- **Vector Storage**: Use Endee for your LlamaIndex embeddings
+- **Multiple Distance Metrics**: Support for cosine, L2, and inner product distance metrics
+- **Metadata Filtering**: Filter search results based on metadata
+- **High Performance**: Optimized for speed and efficiency
+
+## Installation
+
+```bash
+pip install endee-llamaindex
+```
+
+This will install both the `endee-llamaindex` package and its dependencies (`endee` and `llama-index`).
+
+## Quick Start
+
+```python
+import os
+from llama_index.core.schema import TextNode
+from llama_index.core.vector_stores.types import VectorStoreQuery
+from endee_llamaindex import EndeeVectorStore
+
+# Configure your Endee credentials
+api_token = os.environ.get("ENDEE_API_TOKEN")
+index_name = "my_llamaindex_vectors"
+dimension = 1536  # OpenAI ada-002 embedding dimension
+
+# Initialize the vector store
+vector_store = EndeeVectorStore.from_params(
+    api_token=api_token,
+    index_name=index_name,
+    dimension=dimension,
+    space_type="cosine"
+)
+
+# Create a node with embedding
+node = TextNode(
+    text="This is a sample document",
+    id_="doc1",
+    embedding=[0.1, 0.2, 0.3, ...],  # Your embedding vector
+    metadata={
+        "doc_id": "doc1",
+        "source": "example",
+        "author": "Endee"
+    }
+)
+
+# Add the node to the vector store
+vector_store.add([node])
+
+# Query the vector store
+query = VectorStoreQuery(
+    query_embedding=[0.2, 0.3, 0.4, ...],  # Your query vector
+    similarity_top_k=5
+)
+
+results = vector_store.query(query)
+
+# Process results
+for node, score in zip(results.nodes, results.similarities):
+    print(f"Node ID: {node.node_id}, Similarity: {score}")
+    print(f"Text: {node.text}")
+    print(f"Metadata: {node.metadata}")
+```
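The `query()` method also accepts standard LlamaIndex metadata filters: `MetadataFilter` operators (`EQ`, `NE`, `GT`, `GTE`, `LT`, `LTE`, `IN`, `NIN`) are translated into Endee-style filter expressions before the search runs. The snippet below is an illustrative sketch rather than part of the published README; the `category` key and its value are hypothetical and only match nodes that carried that key in their metadata when they were added.

```python
from llama_index.core.vector_stores.types import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
)

# Hypothetical filter: only return nodes whose "category" metadata equals "example".
filters = MetadataFilters(
    filters=[MetadataFilter(key="category", operator=FilterOperator.EQ, value="example")]
)

query = VectorStoreQuery(
    query_embedding=[0.2, 0.3, 0.4, ...],  # Your query vector
    similarity_top_k=5,
    filters=filters,
)
results = vector_store.query(query)
```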
+
+## Using with LlamaIndex
+
+```python
+from llama_index.core import VectorStoreIndex, StorageContext
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+# Initialize your nodes or documents
+nodes = [...]  # Your nodes with text but no embeddings yet
+
+# Setup embedding function
+embed_model = OpenAIEmbedding()  # Or any other embedding model
+
+# Initialize Endee vector store
+vector_store = EndeeVectorStore.from_params(
+    api_token=api_token,
+    index_name=index_name,
+    dimension=1536,  # Make sure this matches your embedding dimension
+)
+
+# Create storage context
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+# Create vector index
+index = VectorStoreIndex(
+    nodes,
+    storage_context=storage_context,
+    embed_model=embed_model
+)
+
+# Query the index
+query_engine = index.as_query_engine()
+response = query_engine.query("Your query here")
+print(response)
+```
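Vectors that were added for a particular source document can be removed with the store's `delete()` method, which issues a filter-based delete on the `doc_id` metadata key. A minimal sketch, not from the published README; `"doc1"` matches the `doc_id` used in the Quick Start node above:

```python
# Remove every vector stored for the given source document.
vector_store.delete(ref_doc_id="doc1")
```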
+
+## Configuration Options
+
+The `EndeeVectorStore` constructor accepts the following parameters:
+
+- `api_token`: Your Endee API token
+- `index_name`: Name of the Endee index
+- `dimension`: Vector dimension (required when creating a new index)
+- `space_type`: Distance metric, one of "cosine", "l2", or "ip" (default: "cosine")
+- `batch_size`: Number of vectors to insert in a single API call (default: 100)
+- `text_key`: Key to use for storing text in metadata (default: "text")
+- `remove_text_from_metadata`: Whether to remove text from metadata (default: False)
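Note that `dimension` only matters when the index has to be created: on startup the store first tries to fetch the named index and only creates a new one (with the given `dimension` and `space_type`) if that lookup fails. A rough sketch of reattaching to an index that already exists, assuming `my_llamaindex_vectors` was created earlier under the same API token; this example is illustrative and not part of the published README:

```python
import os
from endee_llamaindex import EndeeVectorStore

# Reattach to an existing index; no dimension is needed because nothing is created.
vector_store = EndeeVectorStore.from_params(
    api_token=os.environ.get("ENDEE_API_TOKEN"),
    index_name="my_llamaindex_vectors",
)

# The underlying Endee index object is exposed for direct access if needed.
endee_index = vector_store.client
```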
--- /dev/null
+++ endee_llamaindex-0.1.2.dist-info/RECORD
@@ -0,0 +1,6 @@
+endee_llamaindex/__init__.py,sha256=ctCcicNLMO3LpXPGLwvQifvQLX7TEd8CYgFO6Nd9afc,83
+endee_llamaindex/base.py,sha256=g5o5020lZuccMuKdaeNTAQ3a8J368rhIQypeCkOZjFk,13888
+endee_llamaindex-0.1.2.dist-info/METADATA,sha256=7unMMmO3QT520VFRp7UIIpm75VmYVZsx5e_FfJXt1Us,4088
+endee_llamaindex-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+endee_llamaindex-0.1.2.dist-info/top_level.txt,sha256=AReiKL0lBXSdKPsQlDusPIH_qbS_txOSUctuCR0rRNQ,17
+endee_llamaindex-0.1.2.dist-info/RECORD,,
--- /dev/null
+++ endee_llamaindex-0.1.2.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any
+
--- /dev/null
+++ endee_llamaindex-0.1.2.dist-info/top_level.txt
@@ -0,0 +1 @@
+endee_llamaindex