llama-index-vector-stores-chroma 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_index/py.typed +0 -0
- llama_index/vector_stores/chroma/__init__.py +3 -0
- llama_index/vector_stores/chroma/base.py +708 -0
- llama_index_vector_stores_chroma-0.5.5.dist-info/METADATA +13 -0
- llama_index_vector_stores_chroma-0.5.5.dist-info/RECORD +7 -0
- llama_index_vector_stores_chroma-0.5.5.dist-info/WHEEL +4 -0
- llama_index_vector_stores_chroma-0.5.5.dist-info/licenses/LICENSE +21 -0
llama_index/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,708 @@
|
|
|
1
|
+
"""Chroma vector store."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import math
|
|
5
|
+
from typing import Any, Dict, Generator, List, Optional, Union, cast
|
|
6
|
+
|
|
7
|
+
import chromadb
|
|
8
|
+
from chromadb.api.models.Collection import Collection
|
|
9
|
+
from llama_index.core.bridge.pydantic import Field, PrivateAttr
|
|
10
|
+
from llama_index.core.indices.query.embedding_utils import get_top_k_mmr_embeddings
|
|
11
|
+
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
|
|
12
|
+
from llama_index.core.utils import truncate_text
|
|
13
|
+
from llama_index.core.vector_stores.types import (
|
|
14
|
+
BasePydanticVectorStore,
|
|
15
|
+
MetadataFilters,
|
|
16
|
+
VectorStoreQuery,
|
|
17
|
+
VectorStoreQueryMode,
|
|
18
|
+
VectorStoreQueryResult,
|
|
19
|
+
)
|
|
20
|
+
from llama_index.core.vector_stores.utils import (
|
|
21
|
+
legacy_metadata_dict_to_node,
|
|
22
|
+
metadata_dict_to_node,
|
|
23
|
+
node_to_metadata_dict,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# MMR constants
# Default multiplier applied to similarity_top_k to size the MMR candidate
# prefetch when the caller supplies no explicit prefetch size.
DEFAULT_MMR_PREFETCH_FACTOR = 4.0
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _transform_chroma_filter_condition(condition: str) -> str:
|
|
33
|
+
"""Translate standard metadata filter op to Chroma specific spec."""
|
|
34
|
+
if condition == "and":
|
|
35
|
+
return "$and"
|
|
36
|
+
elif condition == "or":
|
|
37
|
+
return "$or"
|
|
38
|
+
else:
|
|
39
|
+
raise ValueError(f"Filter condition {condition} not supported")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _transform_chroma_filter_operator(operator: str) -> str:
|
|
43
|
+
"""Translate standard metadata filter operator to Chroma specific spec."""
|
|
44
|
+
if operator == "!=":
|
|
45
|
+
return "$ne"
|
|
46
|
+
elif operator == "==":
|
|
47
|
+
return "$eq"
|
|
48
|
+
elif operator == ">":
|
|
49
|
+
return "$gt"
|
|
50
|
+
elif operator == "<":
|
|
51
|
+
return "$lt"
|
|
52
|
+
elif operator == ">=":
|
|
53
|
+
return "$gte"
|
|
54
|
+
elif operator == "<=":
|
|
55
|
+
return "$lte"
|
|
56
|
+
elif operator == "in":
|
|
57
|
+
return "$in"
|
|
58
|
+
elif operator == "nin":
|
|
59
|
+
return "$nin"
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError(f"Filter operator {operator} not supported")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _to_chroma_filter(
    standard_filters: MetadataFilters,
) -> dict:
    """
    Convert generic metadata filters into a Chroma ``where`` dictionary.

    Nested ``MetadataFilters`` entries are converted recursively. A leaf
    entry with an operator becomes ``{key: {chroma_operator: value}}``; a
    leaf entry without one becomes ``{key: value}``.

    Args:
        standard_filters (MetadataFilters): filters to convert

    Returns:
        dict: an empty dict when there are no entries, the single converted
            entry when there is exactly one, otherwise
            ``{chroma_condition: [converted entries]}``

    Raises:
        ValueError: if the condition or an operator is not supported

    """
    # The condition is translated up front, so an unsupported condition is
    # rejected even when there are no entries to combine.
    chroma_condition = _transform_chroma_filter_condition(
        standard_filters.condition or "and"
    )

    clauses = []
    for entry in standard_filters.filters or []:
        if isinstance(entry, MetadataFilters):
            clause = _to_chroma_filter(entry)
        elif entry.operator:
            clause = {
                entry.key: {
                    _transform_chroma_filter_operator(entry.operator): entry.value
                }
            }
        else:
            clause = {entry.key: entry.value}
        clauses.append(clause)

    if not clauses:
        return {}
    if len(clauses) == 1:
        # If there is only one filter, return it directly
        return clauses[0]
    return {chroma_condition: clauses}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Message of the ImportError raised by ChromaVectorStore.from_collection when
# chromadb cannot be imported.
import_err_msg = "`chromadb` package not found, please run `pip install chromadb`"

# Largest number of nodes ChromaVectorStore.add() sends to the collection in
# one call.
MAX_CHUNK_SIZE = 41665  # One less than the max chunk size for ChromaDB
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def chunk_list(
    lst: List[BaseNode], max_chunk_size: int
) -> Generator[List[BaseNode], None, None]:
    """
    Split lst into consecutive slices of at most max_chunk_size items.

    Args:
        lst (List[BaseNode]): list of nodes with embeddings
        max_chunk_size (int): max chunk size

    Yields:
        Generator[List[BaseNode], None, None]: list of nodes with embeddings

    """
    chunk_starts = range(0, len(lst), max_chunk_size)
    yield from (lst[start : start + max_chunk_size] for start in chunk_starts)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ChromaVectorStore(BasePydanticVectorStore):
    """
    Chroma vector store.

    In this vector store, embeddings are stored within a ChromaDB collection.

    During query time, the index uses ChromaDB to query for the top
    k most similar nodes.

    Supports MMR (Maximum Marginal Relevance) search mode for improved diversity
    in search results.

    Args:
        chroma_collection (chromadb.api.models.Collection.Collection):
            ChromaDB collection instance. When omitted, the constructor
            connects with ``chromadb.HttpClient`` and gets or creates the
            collection named ``collection_name``.
        collection_name (Optional[str]): name of the collection to get or
            create when ``chroma_collection`` is not given
        host (Optional[str]): host passed to ``chromadb.HttpClient``
        port (Optional[Union[str, int]]): port passed to ``chromadb.HttpClient``
        ssl (bool): ssl flag passed to ``chromadb.HttpClient``
        headers (Optional[Dict[str, str]]): headers passed to
            ``chromadb.HttpClient``
        persist_dir (Optional[str]): stored on the instance; ``from_params``
            uses it to open a ``chromadb.PersistentClient``
        collection_kwargs (Optional[dict]): extra keyword arguments for
            ``get_or_create_collection``

    Examples:
        `uv add llama-index-vector-stores-chroma`

        ```python
        import chromadb
        from llama_index.vector_stores.chroma import ChromaVectorStore

        # Create a Chroma client and collection
        chroma_client = chromadb.EphemeralClient()
        chroma_collection = chroma_client.create_collection("example_collection")

        # Set up the ChromaVectorStore
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

        # Use MMR mode with threshold
        # (`index` is assumed to be an index built on top of `vector_store`)
        query_engine = index.as_query_engine(
            vector_store_query_mode="mmr",
            vector_store_kwargs={"mmr_threshold": 0.5}
        )
        ```

    """

    # add() sends each node's text to the collection as its document.
    stores_text: bool = True
    # Forwarded to node_to_metadata_dict when add() serializes node metadata.
    flat_metadata: bool = True

    # Connection settings; they have no class-level defaults and are filled in
    # by __init__.
    collection_name: Optional[str]
    host: Optional[str]
    port: Optional[Union[str, int]]
    ssl: bool
    headers: Optional[Dict[str, str]]
    persist_dir: Optional[str]
    # Extra keyword arguments for get_or_create_collection.
    collection_kwargs: Dict[str, Any] = Field(default_factory=dict)

    # The ChromaDB collection every operation of this store goes through.
    _collection: Collection = PrivateAttr()
|
|
171
|
+
|
|
172
|
+
def __init__(
    self,
    chroma_collection: Optional[Any] = None,
    collection_name: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[Union[str, int]] = None,
    ssl: bool = False,
    headers: Optional[Dict[str, str]] = None,
    persist_dir: Optional[str] = None,
    collection_kwargs: Optional[dict] = None,
    **kwargs: Any,
) -> None:
    """
    Store the connection settings and bind the ChromaDB collection.

    Args:
        chroma_collection (Optional[Any]): existing collection to use; when
            None, one is obtained through ``chromadb.HttpClient``
        collection_name (Optional[str]): collection to get or create when
            no collection is given
        host (Optional[str]): host for the HTTP client
        port (Optional[Union[str, int]]): port for the HTTP client
        ssl (bool): ssl flag for the HTTP client
        headers (Optional[Dict[str, str]]): headers for the HTTP client
        persist_dir (Optional[str]): only stored on the instance here
        collection_kwargs (Optional[dict]): extra keyword arguments for
            ``get_or_create_collection``
        **kwargs: accepted but not used by this constructor

    """
    if not collection_kwargs:
        collection_kwargs = {}

    super().__init__(
        collection_name=collection_name,
        host=host,
        port=port,
        ssl=ssl,
        headers=headers,
        persist_dir=persist_dir,
        collection_kwargs=collection_kwargs,
    )

    if chroma_collection is not None:
        self._collection = cast(Collection, chroma_collection)
        return

    # No collection supplied: connect over HTTP and get or create it by name.
    http_client = chromadb.HttpClient(host=host, port=port, ssl=ssl, headers=headers)
    self._collection = http_client.get_or_create_collection(
        name=collection_name, **collection_kwargs
    )
|
|
203
|
+
|
|
204
|
+
@classmethod
def from_collection(cls, collection: Any) -> "ChromaVectorStore":
    """
    Build a vector store around an existing ChromaDB collection.

    Args:
        collection (Any): object expected to be a chromadb ``Collection``

    Returns:
        ChromaVectorStore: store bound to the given collection

    Raises:
        ImportError: if chromadb cannot be imported
        Exception: if the argument is not a chromadb collection instance

    """
    try:
        from chromadb import Collection as ChromaCollection
    except ImportError:
        raise ImportError(import_err_msg)

    if isinstance(collection, ChromaCollection):
        return cls(chroma_collection=collection)

    raise Exception("argument is not chromadb collection instance")
|
|
215
|
+
|
|
216
|
+
@classmethod
def from_params(
    cls,
    collection_name: str,
    host: Optional[str] = None,
    port: Optional[Union[str, int]] = None,
    ssl: bool = False,
    headers: Optional[Dict[str, str]] = None,
    persist_dir: Optional[str] = None,
    collection_kwargs: Optional[dict] = None,
    **kwargs: Any,
) -> "ChromaVectorStore":
    """
    Create the Chroma client from parameters and build the vector store.

    ``persist_dir`` takes precedence: when it is set a
    ``chromadb.PersistentClient`` is opened, otherwise ``host`` and ``port``
    must both be set and a ``chromadb.HttpClient`` is used.

    Args:
        collection_name (str): collection to get or create
        host (Optional[str]): host for the HTTP client
        port (Optional[Union[str, int]]): port for the HTTP client
        ssl (bool): ssl flag for the HTTP client
        headers (Optional[Dict[str, str]]): headers for the HTTP client
        persist_dir (Optional[str]): path for the persistent client
        collection_kwargs (Optional[dict]): extra keyword arguments for
            ``get_or_create_collection``; None means no extra arguments
        **kwargs: forwarded to the class constructor

    Returns:
        ChromaVectorStore: store bound to the resolved collection

    Raises:
        ValueError: if neither ``persist_dir`` nor both ``host`` and
            ``port`` are given

    """
    # A fresh dict per call: a `{}` default would be one object shared by
    # every call that omits the argument.
    collection_kwargs = {} if collection_kwargs is None else collection_kwargs

    if persist_dir:
        client = chromadb.PersistentClient(path=persist_dir)
    elif host and port:
        client = chromadb.HttpClient(host=host, port=port, ssl=ssl, headers=headers)
    else:
        raise ValueError(
            "Either `persist_dir` or (`host`,`port`) must be specified"
        )

    collection = client.get_or_create_collection(
        name=collection_name, **collection_kwargs
    )

    # NOTE(review): collection_name is not forwarded, so the returned store's
    # collection_name field keeps the constructor default — confirm intended.
    return cls(
        chroma_collection=collection,
        host=host,
        port=port,
        ssl=ssl,
        headers=headers,
        persist_dir=persist_dir,
        collection_kwargs=collection_kwargs,
        **kwargs,
    )
|
|
252
|
+
|
|
253
|
+
@classmethod
def class_name(cls) -> str:
    """Return the fixed name that identifies this vector store class."""
    name = "ChromaVectorStore"
    return name
|
|
256
|
+
|
|
257
|
+
def get_nodes(
    self,
    node_ids: Optional[List[str]],
    filters: Optional[MetadataFilters] = None,
) -> List[BaseNode]:
    """
    Get nodes from index.

    Args:
        node_ids (Optional[List[str]]): ids of the nodes to fetch; None or
            an empty list applies no id restriction
        filters (Optional[MetadataFilters]): metadata filters, translated
            to a Chroma ``where`` clause

    Returns:
        List[BaseNode]: the matching nodes

    Raises:
        ValueError: if the collection is not initialized

    """
    if not self._collection:
        raise ValueError("Collection not initialized")

    # An empty id list is normalized to None so that no id filter is sent.
    ids = node_ids or None
    # `filters` is a single MetadataFilters object: _to_chroma_filter reads
    # its `condition` and `filters` attributes.
    where = _to_chroma_filter(filters) if filters else None

    return self._get(None, where=where, ids=ids).nodes
|
|
283
|
+
|
|
284
|
+
def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
    """
    Add nodes to index.

    Nodes are written in batches of at most MAX_CHUNK_SIZE. For every node
    the embedding, the metadata dict, the node id and the text content
    (without metadata) are sent to the collection.

    Args:
        nodes: List[BaseNode]: list of nodes with embeddings
        **add_kwargs: accepted but not used

    Returns:
        List[str]: ids of all added nodes, in insertion order

    Raises:
        ValueError: if the collection is not initialized

    """
    if not self._collection:
        raise ValueError("Collection not initialized")

    inserted_ids = []
    for batch in chunk_list(nodes, MAX_CHUNK_SIZE):
        batch_embeddings = []
        batch_metadatas = []
        batch_ids = []
        batch_documents = []

        for node in batch:
            batch_embeddings.append(node.get_embedding())

            raw_metadata = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )
            # None metadata values are stored as empty strings.
            batch_metadatas.append(
                {
                    key: ("" if value is None else value)
                    for key, value in raw_metadata.items()
                }
            )

            batch_ids.append(node.node_id)
            batch_documents.append(
                node.get_content(metadata_mode=MetadataMode.NONE)
            )

        self._collection.add(
            embeddings=batch_embeddings,
            ids=batch_ids,
            metadatas=batch_metadatas,
            documents=batch_documents,
        )
        inserted_ids += batch_ids

    return inserted_ids
|
|
325
|
+
|
|
326
|
+
def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Delete the nodes that belong to the given source document.

    Args:
        ref_doc_id (str): The doc_id of the document to delete.
        **delete_kwargs: accepted but not used

    """
    # Nodes are matched through their "document_id" metadata entry.
    document_filter = {"document_id": ref_doc_id}
    self._collection.delete(where=document_filter)
|
|
335
|
+
|
|
336
|
+
def delete_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
) -> None:
    """
    Delete nodes from index.

    Args:
        node_ids (Optional[List[str]]): ids of the nodes to delete; None is
            sent to the collection as an empty list
        filters (Optional[MetadataFilters]): metadata filters, translated
            to a Chroma ``where`` clause; omitted from the call when empty

    Raises:
        ValueError: if the collection is not initialized

    """
    if not self._collection:
        raise ValueError("Collection not initialized")

    delete_args = {"ids": node_ids or []}
    if filters:
        # `filters` is a single MetadataFilters object: _to_chroma_filter
        # reads its `condition` and `filters` attributes.
        delete_args["where"] = _to_chroma_filter(filters)

    self._collection.delete(**delete_args)
|
|
360
|
+
|
|
361
|
+
def clear(self) -> None:
    """
    Clear the collection.

    All ids currently stored in the collection are fetched and deleted. An
    empty collection is left untouched.
    """
    ids = self._collection.get()["ids"]
    if not ids:
        # Nothing stored: skip the delete, which would otherwise be issued
        # with an empty id list and nothing to select.
        # NOTE(review): some chromadb versions are believed to reject such a
        # call — confirm against the pinned version.
        return
    self._collection.delete(ids=ids)
|
|
365
|
+
|
|
366
|
+
@property
def client(self) -> Any:
    """Return the ChromaDB collection this store operates on."""
    collection = self._collection
    return collection
|
|
370
|
+
|
|
371
|
+
def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
    """
    Query index for top k most similar nodes.

    Without a query embedding the call degrades to a plain fetch limited to
    ``similarity_top_k`` nodes. With an embedding it runs either an MMR
    search or a regular similarity search, depending on ``query.mode``.

    Args:
        query (VectorStoreQuery): Query object containing:
            - query_embedding (List[float]): query embedding
            - similarity_top_k (int): top k most similar nodes
            - filters (Optional[MetadataFilters]): metadata filters to apply
            - mode (VectorStoreQueryMode): query mode (default or MMR)
        **kwargs: Additional keyword arguments passed to ChromaDB query method.
            For MMR mode, supports:
            - mmr_threshold (Optional[float]): MMR threshold between 0 and 1
            - mmr_prefetch_factor (Optional[float]): Factor to multiply similarity_top_k
              for prefetching candidates (default: 4.0)
            - mmr_prefetch_k (Optional[int]): Explicit number of candidates to prefetch
              (cannot be used with mmr_prefetch_factor)
            For ChromaDB-specific parameters:
            - where (dict): ChromaDB where clause (use query.filters instead for standard filtering)
            - include (List[str]): ChromaDB include parameter
            - where_document (dict): ChromaDB where_document parameter

    Returns:
        VectorStoreQueryResult: Query result containing matched nodes, similarities, and IDs.

    Raises:
        ValueError: If MMR parameters are invalid or if both query.filters and
            where kwargs are specified.

    """
    # Resolve the where clause from exactly one source: the generic filters
    # on the query, or the raw Chroma clause in kwargs.
    if query.filters is None:
        where = kwargs.pop("where", None)
    elif "where" in kwargs:
        raise ValueError(
            "Cannot specify metadata filters via both query and kwargs. "
            "Use kwargs only for chroma specific items that are "
            "not supported via the generic query interface."
        )
    else:
        where = _to_chroma_filter(query.filters)

    if not query.query_embedding:
        # Nothing to rank against: fetch by filter only.
        return self._get(limit=query.similarity_top_k, where=where, **kwargs)

    # Handle MMR mode
    if query.mode == VectorStoreQueryMode.MMR:
        return self._mmr_search(query, where, **kwargs)

    return self._query(
        query_embeddings=query.query_embedding,
        n_results=query.similarity_top_k,
        where=where,
        **kwargs,
    )
|
|
425
|
+
|
|
426
|
+
def _query(
    self, query_embeddings: List["float"], n_results: int, where: dict, **kwargs
) -> VectorStoreQueryResult:
    """
    Run a similarity query against the collection and build the result.

    Args:
        query_embeddings: embedding to search with
        n_results: number of results requested from ChromaDB
        where: ChromaDB where clause; left out of the call when empty
        **kwargs: forwarded to the collection's query call

    Returns:
        VectorStoreQueryResult: nodes, similarity scores and ids of the
            first result list. Each similarity is ``exp(-distance)``.

    """

    def _build_node(hit_id, document, raw_metadata):
        # Preferred path: rebuild the node from its serialized metadata.
        try:
            return metadata_dict_to_node(raw_metadata, text=document)
        except Exception:
            # NOTE: deprecated legacy logic for backward compatibility
            legacy_metadata, node_info, relationships = (
                legacy_metadata_dict_to_node(raw_metadata)
            )
            return TextNode(
                text=document or "",
                id_=hit_id,
                metadata=legacy_metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships=relationships,
            )

    # `where` is only passed along when it is non-empty.
    where_args = {"where": where} if where else {}
    results = self._collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
        **where_args,
        **kwargs,
    )

    logger.debug(f"> Top {len(results['documents'][0])} nodes:")

    nodes, similarities, ids = [], [], []
    hits = zip(
        results["ids"][0],
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    )
    for hit_id, document, raw_metadata, distance in hits:
        nodes.append(_build_node(hit_id, document, raw_metadata))

        score = math.exp(-distance)
        similarities.append(score)

        logger.debug(
            f"> [Node {hit_id}] [Similarity score: {score}] "
            f"{truncate_text(str(document), 100)}"
        )
        ids.append(hit_id)

    return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
|
|
482
|
+
|
|
483
|
+
def _mmr_search(
|
|
484
|
+
self, query: VectorStoreQuery, where: dict, **kwargs
|
|
485
|
+
) -> VectorStoreQueryResult:
|
|
486
|
+
"""
|
|
487
|
+
Perform MMR search using ChromaDB.
|
|
488
|
+
|
|
489
|
+
Args:
|
|
490
|
+
query: VectorStoreQuery object containing the query parameters
|
|
491
|
+
where: ChromaDB filter conditions
|
|
492
|
+
**kwargs: Additional keyword arguments including mmr_threshold
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
VectorStoreQueryResult: Query result with MMR-applied nodes
|
|
496
|
+
|
|
497
|
+
"""
|
|
498
|
+
# Extract MMR parameters
|
|
499
|
+
mmr_threshold = kwargs.get("mmr_threshold")
|
|
500
|
+
|
|
501
|
+
# Validate MMR parameters
|
|
502
|
+
if mmr_threshold is not None and (
|
|
503
|
+
not isinstance(mmr_threshold, (int, float))
|
|
504
|
+
or mmr_threshold < 0
|
|
505
|
+
or mmr_threshold > 1
|
|
506
|
+
):
|
|
507
|
+
raise ValueError("mmr_threshold must be a float between 0 and 1")
|
|
508
|
+
|
|
509
|
+
# Validate prefetch parameters (check before popping)
|
|
510
|
+
raw_prefetch_factor = kwargs.get("mmr_prefetch_factor")
|
|
511
|
+
raw_prefetch_k = kwargs.get("mmr_prefetch_k")
|
|
512
|
+
if raw_prefetch_factor is not None and raw_prefetch_k is not None:
|
|
513
|
+
raise ValueError(
|
|
514
|
+
"'mmr_prefetch_factor' and 'mmr_prefetch_k' "
|
|
515
|
+
"cannot coexist in a call to query()"
|
|
516
|
+
)
|
|
517
|
+
|
|
518
|
+
# Strip MMR-only kwargs so they aren't forwarded to Chroma
|
|
519
|
+
mmr_threshold = kwargs.pop("mmr_threshold", None)
|
|
520
|
+
prefetch_k_override = kwargs.pop("mmr_prefetch_k", None)
|
|
521
|
+
prefetch_factor = kwargs.pop("mmr_prefetch_factor", DEFAULT_MMR_PREFETCH_FACTOR)
|
|
522
|
+
|
|
523
|
+
# Calculate prefetch size (get more candidates than needed for MMR)
|
|
524
|
+
if prefetch_k_override is not None:
|
|
525
|
+
prefetch_k = int(prefetch_k_override)
|
|
526
|
+
else:
|
|
527
|
+
prefetch_k = int(query.similarity_top_k * prefetch_factor)
|
|
528
|
+
|
|
529
|
+
# Ensure prefetch_k is at least as large as similarity_top_k
|
|
530
|
+
prefetch_k = max(prefetch_k, query.similarity_top_k)
|
|
531
|
+
|
|
532
|
+
logger.debug(
|
|
533
|
+
f"MMR search: prefetching {prefetch_k} candidates for {query.similarity_top_k} final results"
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
# Query ChromaDB for more candidates than needed (kwargs now safe)
|
|
537
|
+
if where:
|
|
538
|
+
prefetch_results = self._collection.query(
|
|
539
|
+
query_embeddings=query.query_embedding,
|
|
540
|
+
n_results=prefetch_k,
|
|
541
|
+
where=where,
|
|
542
|
+
include=["embeddings", "documents", "metadatas", "distances"],
|
|
543
|
+
**kwargs,
|
|
544
|
+
)
|
|
545
|
+
else:
|
|
546
|
+
prefetch_results = self._collection.query(
|
|
547
|
+
query_embeddings=query.query_embedding,
|
|
548
|
+
n_results=prefetch_k,
|
|
549
|
+
include=["embeddings", "documents", "metadatas", "distances"],
|
|
550
|
+
**kwargs,
|
|
551
|
+
)
|
|
552
|
+
|
|
553
|
+
# Extract embeddings and metadata for MMR processing
|
|
554
|
+
prefetch_embeddings = []
|
|
555
|
+
prefetch_ids = []
|
|
556
|
+
prefetch_metadata = []
|
|
557
|
+
prefetch_documents = []
|
|
558
|
+
prefetch_distances = []
|
|
559
|
+
|
|
560
|
+
# Process prefetch results
|
|
561
|
+
for i in range(len(prefetch_results["ids"][0])):
|
|
562
|
+
node_id = prefetch_results["ids"][0][i]
|
|
563
|
+
text = prefetch_results["documents"][0][i]
|
|
564
|
+
metadata = prefetch_results["metadatas"][0][i]
|
|
565
|
+
distance = prefetch_results["distances"][0][i]
|
|
566
|
+
|
|
567
|
+
# Get the actual embedding from ChromaDB results
|
|
568
|
+
if "embeddings" in prefetch_results and prefetch_results["embeddings"]:
|
|
569
|
+
embedding = prefetch_results["embeddings"][0][i]
|
|
570
|
+
else:
|
|
571
|
+
# Fallback: if embeddings not available, we'll use distance-based approach
|
|
572
|
+
embedding = None
|
|
573
|
+
|
|
574
|
+
# Store for MMR processing
|
|
575
|
+
prefetch_embeddings.append(embedding)
|
|
576
|
+
prefetch_ids.append(node_id)
|
|
577
|
+
prefetch_metadata.append(metadata)
|
|
578
|
+
prefetch_documents.append(text)
|
|
579
|
+
prefetch_distances.append(distance)
|
|
580
|
+
|
|
581
|
+
if not prefetch_embeddings:
|
|
582
|
+
logger.warning("No results found during MMR prefetch")
|
|
583
|
+
return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
|
|
584
|
+
|
|
585
|
+
# Check if we have valid embeddings for MMR
|
|
586
|
+
valid_embeddings = [emb for emb in prefetch_embeddings if emb is not None]
|
|
587
|
+
|
|
588
|
+
if len(valid_embeddings) < query.similarity_top_k:
|
|
589
|
+
logger.warning(
|
|
590
|
+
f"Not enough valid embeddings for MMR: {len(valid_embeddings)} < {query.similarity_top_k}"
|
|
591
|
+
)
|
|
592
|
+
# Fallback to regular similarity search
|
|
593
|
+
return self._query(
|
|
594
|
+
query_embeddings=query.query_embedding,
|
|
595
|
+
n_results=query.similarity_top_k,
|
|
596
|
+
where=where,
|
|
597
|
+
**kwargs,
|
|
598
|
+
)
|
|
599
|
+
|
|
600
|
+
# Apply MMR algorithm using the core utility function
|
|
601
|
+
mmr_similarities, mmr_indices = get_top_k_mmr_embeddings(
|
|
602
|
+
query_embedding=query.query_embedding,
|
|
603
|
+
embeddings=valid_embeddings,
|
|
604
|
+
similarity_top_k=query.similarity_top_k,
|
|
605
|
+
embedding_ids=list(range(len(valid_embeddings))),
|
|
606
|
+
mmr_threshold=mmr_threshold,
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
# Build final results based on MMR selection
|
|
610
|
+
final_nodes = []
|
|
611
|
+
final_similarities = []
|
|
612
|
+
final_ids = []
|
|
613
|
+
|
|
614
|
+
# Create a mapping from valid embedding indices to original prefetch indices
|
|
615
|
+
valid_indices = [
|
|
616
|
+
i for i, emb in enumerate(prefetch_embeddings) if emb is not None
|
|
617
|
+
]
|
|
618
|
+
|
|
619
|
+
for mmr_index in mmr_indices:
|
|
620
|
+
if mmr_index < len(valid_indices):
|
|
621
|
+
original_index = valid_indices[mmr_index]
|
|
622
|
+
if original_index < len(prefetch_ids):
|
|
623
|
+
node_id = prefetch_ids[original_index]
|
|
624
|
+
text = prefetch_documents[original_index]
|
|
625
|
+
metadata = prefetch_metadata[original_index]
|
|
626
|
+
distance = prefetch_distances[original_index]
|
|
627
|
+
|
|
628
|
+
# Create node (reusing logic from _query method)
|
|
629
|
+
try:
|
|
630
|
+
node = metadata_dict_to_node(metadata, text=text)
|
|
631
|
+
except Exception:
|
|
632
|
+
# NOTE: deprecated legacy logic for backward compatibility
|
|
633
|
+
metadata, node_info, relationships = (
|
|
634
|
+
legacy_metadata_dict_to_node(metadata)
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
node = TextNode(
|
|
638
|
+
text=text or "",
|
|
639
|
+
id_=node_id,
|
|
640
|
+
metadata=metadata,
|
|
641
|
+
start_char_idx=node_info.get("start", None),
|
|
642
|
+
end_char_idx=node_info.get("end", None),
|
|
643
|
+
relationships=relationships,
|
|
644
|
+
)
|
|
645
|
+
|
|
646
|
+
final_nodes.append(node)
|
|
647
|
+
final_similarities.append(math.exp(-distance))
|
|
648
|
+
final_ids.append(node_id)
|
|
649
|
+
|
|
650
|
+
logger.debug(
|
|
651
|
+
f"MMR search completed: {len(final_nodes)} results selected from {len(prefetch_embeddings)} candidates"
|
|
652
|
+
)
|
|
653
|
+
|
|
654
|
+
return VectorStoreQueryResult(
|
|
655
|
+
nodes=final_nodes, similarities=final_similarities, ids=final_ids
|
|
656
|
+
)
|
|
657
|
+
|
|
658
|
+
def _get(
    self, limit: Optional[int], where: dict, **kwargs
) -> VectorStoreQueryResult:
    """
    Fetch nodes from the collection without similarity ranking.

    Args:
        limit: maximum number of entries to fetch, or None
        where: ChromaDB where clause; left out of the call when empty
        **kwargs: forwarded to the collection's get call

    Returns:
        VectorStoreQueryResult: the fetched nodes and their ids (no
            similarities)

    """

    def _build_node(entry_id, document, raw_metadata):
        # Preferred path: rebuild the node from its serialized metadata.
        try:
            return metadata_dict_to_node(raw_metadata, text=document)
        except Exception:
            # NOTE: deprecated legacy logic for backward compatibility
            legacy_metadata, node_info, relationships = (
                legacy_metadata_dict_to_node(raw_metadata)
            )
            return TextNode(
                text=document or "",
                id_=entry_id,
                metadata=legacy_metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships=relationships,
            )

    # `where` is only passed along when it is non-empty.
    where_args = {"where": where} if where else {}
    results = self._collection.get(limit=limit, **where_args, **kwargs)

    logger.debug(f"> Top {len(results['documents'])} nodes:")

    if not results["ids"]:
        results["ids"] = [[]]

    nodes, ids = [], []
    entries = zip(results["ids"], results["documents"], results["metadatas"])
    for entry_id, document, raw_metadata in entries:
        nodes.append(_build_node(entry_id, document, raw_metadata))

        logger.debug(
            f"> [Node {entry_id}] [Similarity score: N/A - using get()] "
            f"{truncate_text(str(document), 100)}"
        )
        ids.append(entry_id)

    return VectorStoreQueryResult(nodes=nodes, ids=ids)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llama-index-vector-stores-chroma
|
|
3
|
+
Version: 0.5.5
|
|
4
|
+
Summary: llama-index vector_stores chroma integration
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: <4.0,>=3.10
|
|
9
|
+
Requires-Dist: chromadb>=0.5.17
|
|
10
|
+
Requires-Dist: llama-index-core<0.15,>=0.13.0
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# LlamaIndex Vector_Stores Integration: Chroma
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
llama_index/vector_stores/chroma/__init__.py,sha256=QNMK-nHKEt-wmks5mhWfdOKDybpmsqrL4neV-HCA6N4,101
|
|
3
|
+
llama_index/vector_stores/chroma/base.py,sha256=a1vzhTREGAM-CaEGpbki6u3rnKqIQQzNQonVOozTgyQ,24473
|
|
4
|
+
llama_index_vector_stores_chroma-0.5.5.dist-info/METADATA,sha256=Jrl0l3LuHI6TSgN4m8gDuYdbyUp6OggpP4dmSoof_Yc,413
|
|
5
|
+
llama_index_vector_stores_chroma-0.5.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
llama_index_vector_stores_chroma-0.5.5.dist-info/licenses/LICENSE,sha256=JPQLUZD9rKvCTdu192Nk0V5PAwklIg6jANii3UmTyMs,1065
|
|
7
|
+
llama_index_vector_stores_chroma-0.5.5.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Jerry Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|