llama-index-vector-stores-chroma 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of llama-index-vector-stores-chroma might be problematic; see the release details below for more information.

@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: llama-index-vector-stores-chroma
3
+ Version: 0.0.1
4
+ Summary: llama-index vector_stores chroma integration
5
+ License: MIT
6
+ Author: Your Name
7
+ Author-email: you@example.com
8
+ Requires-Python: >=3.8.1,<3.12
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Requires-Dist: chromadb (>=0.4.22,<0.5.0)
15
+ Requires-Dist: llama-index-core (>=0.9.32,<0.10.0)
16
+ Description-Content-Type: text/markdown
17
+
18
+ # LlamaIndex Vector_Stores Integration: Chroma
19
+
@@ -0,0 +1 @@
1
+ # LlamaIndex Vector_Stores Integration: Chroma
@@ -0,0 +1,4 @@
1
+ from llama_index.vector_stores.chroma.base import ChromaVectorStore
2
+
3
+
4
+ __all__ = ["ChromaVectorStore"]
@@ -0,0 +1,339 @@
1
+ """Chroma vector store."""
2
+ import logging
3
+ import math
4
+ from typing import Any, Dict, Generator, List, Optional, cast
5
+
6
+ import chromadb
7
+ from chromadb.api.models.Collection import Collection
8
+
9
+ from llama_index.core.bridge.pydantic import Field, PrivateAttr
10
+ from llama_index.core.schema import BaseNode, MetadataMode, TextNode
11
+ from llama_index.core.utils import truncate_text
12
+ from llama_index.core.vector_stores.types import (
13
+ BasePydanticVectorStore,
14
+ MetadataFilters,
15
+ VectorStoreQuery,
16
+ VectorStoreQueryResult,
17
+ )
18
+ from llama_index.core.vector_stores.utils import (
19
+ legacy_metadata_dict_to_node,
20
+ metadata_dict_to_node,
21
+ node_to_metadata_dict,
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ def _transform_chroma_filter_condition(condition: str) -> str:
28
+ """Translate standard metadata filter op to Chroma specific spec."""
29
+ if condition == "and":
30
+ return "$and"
31
+ elif condition == "or":
32
+ return "$or"
33
+ else:
34
+ raise ValueError(f"Filter condition {condition} not supported")
35
+
36
+
37
+ def _transform_chroma_filter_operator(operator: str) -> str:
38
+ """Translate standard metadata filter operator to Chroma specific spec."""
39
+ if operator == "!=":
40
+ return "$ne"
41
+ elif operator == "==":
42
+ return "$eq"
43
+ elif operator == ">":
44
+ return "$gt"
45
+ elif operator == "<":
46
+ return "$lt"
47
+ elif operator == ">=":
48
+ return "$gte"
49
+ elif operator == "<=":
50
+ return "$lte"
51
+ else:
52
+ raise ValueError(f"Filter operator {operator} not supported")
53
+
54
+
55
def _to_chroma_filter(
    standard_filters: MetadataFilters,
) -> dict:
    """Translate standard metadata filters to Chroma specific spec.

    Args:
        standard_filters (MetadataFilters): generic filter specification.

    Returns:
        dict: a Chroma ``where`` clause. A single filter is returned bare;
            multiple filters are wrapped under the boolean combinator
            (``$and`` / ``$or``); no filters yields an empty dict.
    """
    # Resolve the combinator up front so an unsupported condition raises
    # even when there is only a single filter (matching prior behavior).
    chroma_condition = _transform_chroma_filter_condition(
        standard_filters.condition or "and"
    )

    clauses = []
    for metadata_filter in standard_filters.filters or []:
        if metadata_filter.operator:
            chroma_op = _transform_chroma_filter_operator(metadata_filter.operator)
            clauses.append({metadata_filter.key: {chroma_op: metadata_filter.value}})
        else:
            # No explicit operator: Chroma treats a bare value as equality.
            clauses.append({metadata_filter.key: metadata_filter.value})

    if len(clauses) == 1:
        # If there is only one filter, return it directly
        return clauses[0]
    if clauses:
        return {chroma_condition: clauses}
    return {}
84
+
85
+
86
# Error message raised lazily (e.g. in `from_collection`) when chromadb is absent.
import_err_msg = "`chromadb` package not found, please run `pip install chromadb`"

# Batch-size cap for collection.add() calls; inserts are chunked so a single
# request never exceeds ChromaDB's limit.
MAX_CHUNK_SIZE = 41665  # One less than the max chunk size for ChromaDB
89
+
90
+
91
from typing import TypeVar

_T = TypeVar("_T")


def chunk_list(
    lst: List[_T], max_chunk_size: int
) -> Generator[List[_T], None, None]:
    """Yield successive max_chunk_size-sized chunks from lst.

    Generalized to any element type: the slicing logic does not depend on
    `BaseNode`, so callers may chunk arbitrary lists. Within this module it
    is used to batch node inserts below ChromaDB's per-call limit.

    Args:
        lst (List[_T]): list to split (e.g. nodes with embeddings)
        max_chunk_size (int): max chunk size; must be a positive integer

    Yields:
        Generator[List[_T], None, None]: consecutive slices of ``lst``

    Raises:
        ValueError: if ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        # A zero step would raise a confusing error from range(); a negative
        # one would silently yield nothing. Fail fast instead.
        raise ValueError("max_chunk_size must be a positive integer")
    for i in range(0, len(lst), max_chunk_size):
        yield lst[i : i + max_chunk_size]
105
+
106
+
107
class ChromaVectorStore(BasePydanticVectorStore):
    """Chroma vector store.

    In this vector store, embeddings are stored within a ChromaDB collection.

    During query time, the index uses ChromaDB to query for the top
    k most similar nodes.

    Args:
        chroma_collection (chromadb.api.models.Collection.Collection):
            ChromaDB collection instance. If omitted, a client and collection
            are created from the remaining connection parameters.

    """

    stores_text: bool = True
    flat_metadata: bool = True

    # Connection/creation parameters kept as model fields so the store can be
    # serialized and reconstructed.
    collection_name: Optional[str]
    host: Optional[str]
    port: Optional[str]
    ssl: bool
    headers: Optional[Dict[str, str]]
    persist_dir: Optional[str]
    collection_kwargs: Dict[str, Any] = Field(default_factory=dict)

    # The live chromadb Collection handle (not serialized).
    _collection: Any = PrivateAttr()

    def __init__(
        self,
        chroma_collection: Optional[Any] = None,
        collection_name: Optional[str] = None,
        host: Optional[str] = None,
        port: Optional[str] = None,
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        persist_dir: Optional[str] = None,
        collection_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        # BUG FIX: `collection_kwargs` defaults to None but was previously
        # splatted directly (`**collection_kwargs`), raising TypeError on the
        # client-creation path whenever the caller passed no kwargs.
        collection_kwargs = collection_kwargs or {}
        if chroma_collection is None:
            # BUG FIX: `persist_dir` was accepted but silently ignored here,
            # always forcing an HTTP connection. Honor it, mirroring
            # `from_params`.
            if persist_dir:
                client = chromadb.PersistentClient(path=persist_dir)
            else:
                client = chromadb.HttpClient(
                    host=host, port=port, ssl=ssl, headers=headers
                )
            self._collection = client.get_or_create_collection(
                name=collection_name, **collection_kwargs
            )
        else:
            self._collection = cast(Collection, chroma_collection)

        super().__init__(
            host=host,
            port=port,
            ssl=ssl,
            headers=headers,
            collection_name=collection_name,
            persist_dir=persist_dir,
            collection_kwargs=collection_kwargs,
        )

    @classmethod
    def from_collection(cls, collection: Any) -> "ChromaVectorStore":
        """Build a store around an existing chromadb Collection.

        Raises:
            ImportError: if `chromadb` is not installed.
            Exception: if `collection` is not a chromadb Collection instance.
        """
        try:
            from chromadb import Collection
        except ImportError:
            raise ImportError(import_err_msg)

        if not isinstance(collection, Collection):
            raise Exception("argument is not chromadb collection instance")

        return cls(chroma_collection=collection)

    @classmethod
    def from_params(
        cls,
        collection_name: str,
        host: Optional[str] = None,
        port: Optional[str] = None,
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        persist_dir: Optional[str] = None,
        collection_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> "ChromaVectorStore":
        """Create a store from connection parameters.

        Either `persist_dir` (local persistent client) or `host`/`port`
        (remote HTTP client) must be provided.

        Raises:
            ValueError: if neither `persist_dir` nor (`host`, `port`) is given.
        """
        # BUG FIX: previous signature used a mutable default argument
        # (`collection_kwargs: dict = {}`) shared across all calls.
        collection_kwargs = collection_kwargs or {}
        if persist_dir:
            client = chromadb.PersistentClient(path=persist_dir)
            collection = client.get_or_create_collection(
                name=collection_name, **collection_kwargs
            )
        elif host and port:
            client = chromadb.HttpClient(host=host, port=port, ssl=ssl, headers=headers)
            collection = client.get_or_create_collection(
                name=collection_name, **collection_kwargs
            )
        else:
            raise ValueError(
                "Either `persist_dir` or (`host`,`port`) must be specified"
            )
        return cls(
            chroma_collection=collection,
            host=host,
            port=port,
            ssl=ssl,
            headers=headers,
            persist_dir=persist_dir,
            collection_kwargs=collection_kwargs,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the class name used for (de)serialization."""
        return "ChromaVectorStore"

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to index.

        Inserts are chunked so that no single `collection.add` call exceeds
        ChromaDB's batch limit.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            List[str]: ids of all inserted nodes.

        Raises:
            ValueError: if the collection was never initialized.
        """
        if not self._collection:
            raise ValueError("Collection not initialized")

        max_chunk_size = MAX_CHUNK_SIZE
        node_chunks = chunk_list(nodes, max_chunk_size)

        all_ids = []
        for node_chunk in node_chunks:
            embeddings = []
            metadatas = []
            ids = []
            documents = []
            for node in node_chunk:
                embeddings.append(node.get_embedding())
                metadata_dict = node_to_metadata_dict(
                    node, remove_text=True, flat_metadata=self.flat_metadata
                )
                # Chroma rejects None metadata values; coerce them to "".
                for key in metadata_dict:
                    if metadata_dict[key] is None:
                        metadata_dict[key] = ""
                metadatas.append(metadata_dict)
                ids.append(node.node_id)
                documents.append(node.get_content(metadata_mode=MetadataMode.NONE))

            self._collection.add(
                embeddings=embeddings,
                ids=ids,
                metadatas=metadatas,
                documents=documents,
            )
            all_ids.extend(ids)

        return all_ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes with the given ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        # NOTE(review): assumes node metadata records the source document id
        # under the "document_id" key — confirm against node_to_metadata_dict.
        self._collection.delete(where={"document_id": ref_doc_id})

    @property
    def client(self) -> Any:
        """Return client (the underlying chromadb collection handle)."""
        return self._collection

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query_embedding (List[float]): query embedding
            similarity_top_k (int): top k most similar nodes

        Raises:
            ValueError: if metadata filters are supplied both on `query`
                and via the `where` kwarg.
        """
        if query.filters is not None:
            if "where" in kwargs:
                raise ValueError(
                    "Cannot specify metadata filters via both query and kwargs. "
                    "Use kwargs only for chroma specific items that are "
                    "not supported via the generic query interface."
                )
            where = _to_chroma_filter(query.filters)
        else:
            where = kwargs.pop("where", {})

        results = self._collection.query(
            query_embeddings=query.query_embedding,
            n_results=query.similarity_top_k,
            where=where,
            **kwargs,
        )

        # BUG FIX: len(results["documents"]) is the number of queries (always
        # 1 here), not the number of hits; report the per-query hit count.
        logger.debug(f"> Top {len(results['documents'][0])} nodes:")
        nodes = []
        similarities = []
        ids = []
        # Chroma returns per-query lists; we issued one query, hence [0].
        for node_id, text, metadata, distance in zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        ):
            try:
                node = metadata_dict_to_node(metadata)
                node.set_content(text)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata
                )

                node = TextNode(
                    text=text,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            nodes.append(node)

            # Map distance to a similarity in (0, 1]: smaller distance -> 1.
            similarity_score = math.exp(-distance)
            similarities.append(similarity_score)

            logger.debug(
                f"> [Node {node_id}] [Similarity score: {similarity_score}] "
                f"{truncate_text(str(text), 100)}"
            )
            ids.append(node_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = ["poetry-core"]
3
+ build-backend = "poetry.core.masonry.api"
4
+
5
+ [tool.codespell]
6
+ check-filenames = true
7
+ check-hidden = true
8
+ # Feel free to un-skip examples, and experimental, you will just need to
9
+ # work through many typos (--write-changes and --interactive will help)
10
+ skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
11
+
12
+ [tool.mypy]
13
+ disallow_untyped_defs = true
14
+ # Remove venv skip when integrated with pre-commit
15
+ exclude = ["_static", "build", "examples", "notebooks", "venv"]
16
+ ignore_missing_imports = true
17
+ python_version = "3.8"
18
+
19
+ [tool.poetry]
20
+ name = "llama-index-vector-stores-chroma"
21
+ version = "0.0.1"
22
+ description = "llama-index vector_stores chroma integration"
23
+ authors = ["Your Name <you@example.com>"]
24
+ license = "MIT"
25
+ readme = "README.md"
26
+ packages = [{include = "llama_index/"}]
27
+
28
+ [tool.poetry.dependencies]
29
+ python = ">=3.8.1,<3.12"
30
+ llama-index-core = "^0.9.32"
31
+ chromadb = "^0.4.22"
32
+
33
+ [tool.poetry.group.dev.dependencies]
34
+ black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
35
+ codespell = {extras = ["toml"], version = ">=v2.2.6"}
36
+ ipython = "8.10.0"
37
+ jupyter = "^1.0.0"
38
+ mypy = "0.991"
39
+ pre-commit = "3.2.0"
40
+ pylint = "2.15.10"
41
+ pytest = "7.2.1"
42
+ pytest-mock = "3.11.1"
43
+ ruff = "0.0.292"
44
+ tree-sitter-languages = "^1.8.0"
45
+ types-Deprecated = ">=0.1.0"
46
+ types-PyYAML = "^6.0.12.12"
47
+ types-protobuf = "^4.24.0.4"
48
+ types-redis = "4.5.5.0"
49
+ types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991
50
+ types-setuptools = "67.1.0.0"