llama-index-vector-stores-chroma 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of llama-index-vector-stores-chroma might be problematic.
llama_index/vector_stores/chroma/base.py

@@ -7,12 +7,14 @@ from typing import Any, Dict, Generator, List, Optional, Union, cast
 import chromadb
 from chromadb.api.models.Collection import Collection
 from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.indices.query.embedding_utils import get_top_k_mmr_embeddings
 from llama_index.core.schema import BaseNode, MetadataMode, TextNode
 from llama_index.core.utils import truncate_text
 from llama_index.core.vector_stores.types import (
     BasePydanticVectorStore,
     MetadataFilters,
     VectorStoreQuery,
+    VectorStoreQueryMode,
     VectorStoreQueryResult,
 )
 from llama_index.core.vector_stores.utils import (
@@ -23,6 +25,9 @@ from llama_index.core.vector_stores.utils import (

 logger = logging.getLogger(__name__)

+# MMR constants
+DEFAULT_MMR_PREFETCH_FACTOR = 4.0
+

 def _transform_chroma_filter_condition(condition: str) -> str:
     """Translate standard metadata filter op to Chroma specific spec."""
@@ -119,12 +124,15 @@ class ChromaVectorStore(BasePydanticVectorStore):
     During query time, the index uses ChromaDB to query for the top
     k most similar nodes.

+    Supports MMR (Maximum Marginal Relevance) search mode for improved diversity
+    in search results.
+
     Args:
         chroma_collection (chromadb.api.models.Collection.Collection):
             ChromaDB collection instance

     Examples:
-        `pip install llama-index-vector-stores-chroma`
+        `uv add llama-index-vector-stores-chroma`

         ```python
         import chromadb
@@ -136,6 +144,12 @@ class ChromaVectorStore(BasePydanticVectorStore):

         # Set up the ChromaVectorStore and StorageContext
         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+
+        # Use MMR mode with threshold
+        query_engine = index.as_query_engine(
+            vector_store_query_mode="mmr",
+            vector_store_kwargs={"mmr_threshold": 0.5}
+        )
         ```

     """
@@ -375,6 +389,10 @@ class ChromaVectorStore(BasePydanticVectorStore):
         if not query.query_embedding:
             return self._get(limit=query.similarity_top_k, where=where, **kwargs)

+        # Handle MMR mode
+        if query.mode == VectorStoreQueryMode.MMR:
+            return self._mmr_search(query, where, **kwargs)
+
         return self._query(
             query_embeddings=query.query_embedding,
             n_results=query.similarity_top_k,
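The same dispatch can also be exercised at the vector-store level without a query engine. A hedged sketch, reusing the `vector_store` from the example above and assuming a precomputed query embedding that matches the collection's dimensionality:

```python
# Hedged sketch; the embedding below is a placeholder and must match the
# dimensionality of the vectors stored in the Chroma collection.
from llama_index.core.vector_stores.types import (
    VectorStoreQuery,
    VectorStoreQueryMode,
)

query = VectorStoreQuery(
    query_embedding=[0.1, 0.2, 0.3],  # placeholder embedding
    similarity_top_k=5,
    mode=VectorStoreQueryMode.MMR,
)

# MMR-only kwargs (mmr_threshold, mmr_prefetch_factor / mmr_prefetch_k) are
# consumed by _mmr_search and not forwarded to Chroma.
result = vector_store.query(query, mmr_threshold=0.5, mmr_prefetch_factor=4.0)
print(result.ids, result.similarities)
```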
@@ -440,6 +458,182 @@ class ChromaVectorStore(BasePydanticVectorStore):

         return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

+    def _mmr_search(
+        self, query: VectorStoreQuery, where: dict, **kwargs
+    ) -> VectorStoreQueryResult:
+        """
+        Perform MMR search using ChromaDB.
+
+        Args:
+            query: VectorStoreQuery object containing the query parameters
+            where: ChromaDB filter conditions
+            **kwargs: Additional keyword arguments including mmr_threshold
+
+        Returns:
+            VectorStoreQueryResult: Query result with MMR-applied nodes
+
+        """
+        # Extract MMR parameters
+        mmr_threshold = kwargs.get("mmr_threshold")
+
+        # Validate MMR parameters
+        if mmr_threshold is not None and (
+            not isinstance(mmr_threshold, (int, float))
+            or mmr_threshold < 0
+            or mmr_threshold > 1
+        ):
+            raise ValueError("mmr_threshold must be a float between 0 and 1")
+
+        # Validate prefetch parameters (check before popping)
+        raw_prefetch_factor = kwargs.get("mmr_prefetch_factor")
+        raw_prefetch_k = kwargs.get("mmr_prefetch_k")
+        if raw_prefetch_factor is not None and raw_prefetch_k is not None:
+            raise ValueError(
+                "'mmr_prefetch_factor' and 'mmr_prefetch_k' "
+                "cannot coexist in a call to query()"
+            )
+
+        # Strip MMR-only kwargs so they aren't forwarded to Chroma
+        mmr_threshold = kwargs.pop("mmr_threshold", None)
+        prefetch_k_override = kwargs.pop("mmr_prefetch_k", None)
+        prefetch_factor = kwargs.pop("mmr_prefetch_factor", DEFAULT_MMR_PREFETCH_FACTOR)
+
+        # Calculate prefetch size (get more candidates than needed for MMR)
+        if prefetch_k_override is not None:
+            prefetch_k = int(prefetch_k_override)
+        else:
+            prefetch_k = int(query.similarity_top_k * prefetch_factor)
+
+        # Ensure prefetch_k is at least as large as similarity_top_k
+        prefetch_k = max(prefetch_k, query.similarity_top_k)
+
+        logger.debug(
+            f"MMR search: prefetching {prefetch_k} candidates for {query.similarity_top_k} final results"
+        )
+
+        # Query ChromaDB for more candidates than needed (kwargs now safe)
+        if where:
+            prefetch_results = self._collection.query(
+                query_embeddings=query.query_embedding,
+                n_results=prefetch_k,
+                where=where,
+                include=["embeddings", "documents", "metadatas", "distances"],
+                **kwargs,
+            )
+        else:
+            prefetch_results = self._collection.query(
+                query_embeddings=query.query_embedding,
+                n_results=prefetch_k,
+                include=["embeddings", "documents", "metadatas", "distances"],
+                **kwargs,
+            )
+
+        # Extract embeddings and metadata for MMR processing
+        prefetch_embeddings = []
+        prefetch_ids = []
+        prefetch_metadata = []
+        prefetch_documents = []
+        prefetch_distances = []
+
+        # Process prefetch results
+        for i in range(len(prefetch_results["ids"][0])):
+            node_id = prefetch_results["ids"][0][i]
+            text = prefetch_results["documents"][0][i]
+            metadata = prefetch_results["metadatas"][0][i]
+            distance = prefetch_results["distances"][0][i]
+
+            # Get the actual embedding from ChromaDB results
+            if "embeddings" in prefetch_results and prefetch_results["embeddings"]:
+                embedding = prefetch_results["embeddings"][0][i]
+            else:
+                # Fallback: if embeddings not available, we'll use distance-based approach
+                embedding = None
+
+            # Store for MMR processing
+            prefetch_embeddings.append(embedding)
+            prefetch_ids.append(node_id)
+            prefetch_metadata.append(metadata)
+            prefetch_documents.append(text)
+            prefetch_distances.append(distance)
+
+        if not prefetch_embeddings:
+            logger.warning("No results found during MMR prefetch")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+        # Check if we have valid embeddings for MMR
+        valid_embeddings = [emb for emb in prefetch_embeddings if emb is not None]
+
+        if len(valid_embeddings) < query.similarity_top_k:
+            logger.warning(
+                f"Not enough valid embeddings for MMR: {len(valid_embeddings)} < {query.similarity_top_k}"
+            )
+            # Fallback to regular similarity search
+            return self._query(
+                query_embeddings=query.query_embedding,
+                n_results=query.similarity_top_k,
+                where=where,
+                **kwargs,
+            )
+
+        # Apply MMR algorithm using the core utility function
+        mmr_similarities, mmr_indices = get_top_k_mmr_embeddings(
+            query_embedding=query.query_embedding,
+            embeddings=valid_embeddings,
+            similarity_top_k=query.similarity_top_k,
+            embedding_ids=list(range(len(valid_embeddings))),
+            mmr_threshold=mmr_threshold,
+        )
+
+        # Build final results based on MMR selection
+        final_nodes = []
+        final_similarities = []
+        final_ids = []
+
+        # Create a mapping from valid embedding indices to original prefetch indices
+        valid_indices = [
+            i for i, emb in enumerate(prefetch_embeddings) if emb is not None
+        ]
+
+        for mmr_index in mmr_indices:
+            if mmr_index < len(valid_indices):
+                original_index = valid_indices[mmr_index]
+                if original_index < len(prefetch_ids):
+                    node_id = prefetch_ids[original_index]
+                    text = prefetch_documents[original_index]
+                    metadata = prefetch_metadata[original_index]
+                    distance = prefetch_distances[original_index]
+
+                    # Create node (reusing logic from _query method)
+                    try:
+                        node = metadata_dict_to_node(metadata)
+                        node.set_content(text)
+                    except Exception:
+                        # NOTE: deprecated legacy logic for backward compatibility
+                        metadata, node_info, relationships = (
+                            legacy_metadata_dict_to_node(metadata)
+                        )
+
+                        node = TextNode(
+                            text=text,
+                            id_=node_id,
+                            metadata=metadata,
+                            start_char_idx=node_info.get("start", None),
+                            end_char_idx=node_info.get("end", None),
+                            relationships=relationships,
+                        )
+
+                    final_nodes.append(node)
+                    final_similarities.append(math.exp(-distance))
+                    final_ids.append(node_id)
+
+        logger.debug(
+            f"MMR search completed: {len(final_nodes)} results selected from {len(prefetch_embeddings)} candidates"
+        )
+
+        return VectorStoreQueryResult(
+            nodes=final_nodes, similarities=final_similarities, ids=final_ids
+        )
+
     def _get(
         self, limit: Optional[int], where: dict, **kwargs
    ) -> VectorStoreQueryResult:
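The candidate selection itself is delegated to `get_top_k_mmr_embeddings` from llama-index core. For reference, classic MMR scores each remaining candidate by a weighted trade-off between query relevance and redundancy with the already-selected results; a minimal, self-contained sketch of that idea (not the library's implementation, and assuming cosine similarity with a threshold in [0, 1]):

```python
# Minimal MMR reference sketch; NOT the llama-index implementation.
from typing import List

import numpy as np


def mmr_select(
    query_emb: np.ndarray,
    cand_embs: List[np.ndarray],
    top_k: int,
    threshold: float = 0.5,
) -> List[int]:
    """Return indices of candidates chosen by a classic MMR loop."""

    def cos(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    selected: List[int] = []
    remaining = list(range(len(cand_embs)))
    while remaining and len(selected) < top_k:

        def score(i: int) -> float:
            relevance = cos(query_emb, cand_embs[i])
            redundancy = max(
                (cos(cand_embs[i], cand_embs[j]) for j in selected), default=0.0
            )
            # threshold = 1.0 -> pure relevance; threshold = 0.0 -> pure diversity
            return threshold * relevance - (1.0 - threshold) * redundancy

        best = max(remaining, key=score)
        selected.append(best)
        remaining.remove(best)
    return selected
```

The similarities that `_mmr_search` returns are derived from Chroma distances via `math.exp(-distance)`, the same conversion visible in the node-building loop above.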
llama_index_vector_stores_chroma-0.5.1.dist-info/METADATA → llama_index_vector_stores_chroma-0.5.3.dist-info/METADATA

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: llama-index-vector-stores-chroma
-Version: 0.5.1
+Version: 0.5.3
 Summary: llama-index vector_stores chroma integration
 Author-email: Your Name <you@example.com>
 License-Expression: MIT
 License-File: LICENSE
 Requires-Python: <4.0,>=3.10
 Requires-Dist: chromadb>=0.5.17
-Requires-Dist: llama-index-core<0.14,>=0.13.0
+Requires-Dist: llama-index-core<0.15,>=0.13.0
 Description-Content-Type: text/markdown

 # LlamaIndex Vector_Stores Integration: Chroma
llama_index_vector_stores_chroma-0.5.3.dist-info/RECORD (new file)

@@ -0,0 +1,7 @@
+llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+llama_index/vector_stores/chroma/__init__.py,sha256=QNMK-nHKEt-wmks5mhWfdOKDybpmsqrL4neV-HCA6N4,101
+llama_index/vector_stores/chroma/base.py,sha256=xXGVw1ByVQtAOhodvlxvPpUA3kc_SocSUUUfW2rBUzQ,23133
+llama_index_vector_stores_chroma-0.5.3.dist-info/METADATA,sha256=bzk67pd5HOPJkBp-ULANUzw1yrUeqMfjPtYWv3o0PHs,413
+llama_index_vector_stores_chroma-0.5.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+llama_index_vector_stores_chroma-0.5.3.dist-info/licenses/LICENSE,sha256=JPQLUZD9rKvCTdu192Nk0V5PAwklIg6jANii3UmTyMs,1065
+llama_index_vector_stores_chroma-0.5.3.dist-info/RECORD,,
llama_index_vector_stores_chroma-0.5.1.dist-info/RECORD (removed file)

@@ -1,7 +0,0 @@
-llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-llama_index/vector_stores/chroma/__init__.py,sha256=QNMK-nHKEt-wmks5mhWfdOKDybpmsqrL4neV-HCA6N4,101
-llama_index/vector_stores/chroma/base.py,sha256=XG_ul6oP7Qnh8ppphWb0hWnDby__Fc5KbkkBT22V8kM,15361
-llama_index_vector_stores_chroma-0.5.1.dist-info/METADATA,sha256=HJIoiGLND2vyOvv6xl_IUO714gaxXWBvH1cLZ1qlAFM,413
-llama_index_vector_stores_chroma-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-llama_index_vector_stores_chroma-0.5.1.dist-info/licenses/LICENSE,sha256=JPQLUZD9rKvCTdu192Nk0V5PAwklIg6jANii3UmTyMs,1065
-llama_index_vector_stores_chroma-0.5.1.dist-info/RECORD,,