llama-index-vector-stores-chroma 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llama-index-vector-stores-chroma might be problematic. Click here for more details.
- llama_index_vector_stores_chroma-0.0.1/PKG-INFO +19 -0
- llama_index_vector_stores_chroma-0.0.1/README.md +1 -0
- llama_index_vector_stores_chroma-0.0.1/llama_index/vector_stores/chroma/__init__.py +4 -0
- llama_index_vector_stores_chroma-0.0.1/llama_index/vector_stores/chroma/base.py +339 -0
- llama_index_vector_stores_chroma-0.0.1/pyproject.toml +50 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: llama-index-vector-stores-chroma
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: llama-index vector_stores chroma integration
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: you@example.com
|
|
8
|
+
Requires-Python: >=3.8.1,<3.12
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Requires-Dist: chromadb (>=0.4.22,<0.5.0)
|
|
15
|
+
Requires-Dist: llama-index-core (>=0.9.32,<0.10.0)
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# LlamaIndex Vector_Stores Integration: Chroma
|
|
19
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# LlamaIndex Vector_Stores Integration: Chroma
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
"""Chroma vector store."""
|
|
2
|
+
import logging
|
|
3
|
+
import math
|
|
4
|
+
from typing import Any, Dict, Generator, List, Optional, cast
|
|
5
|
+
|
|
6
|
+
import chromadb
|
|
7
|
+
from chromadb.api.models.Collection import Collection
|
|
8
|
+
|
|
9
|
+
from llama_index.core.bridge.pydantic import Field, PrivateAttr
|
|
10
|
+
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
|
|
11
|
+
from llama_index.core.utils import truncate_text
|
|
12
|
+
from llama_index.core.vector_stores.types import (
|
|
13
|
+
BasePydanticVectorStore,
|
|
14
|
+
MetadataFilters,
|
|
15
|
+
VectorStoreQuery,
|
|
16
|
+
VectorStoreQueryResult,
|
|
17
|
+
)
|
|
18
|
+
from llama_index.core.vector_stores.utils import (
|
|
19
|
+
legacy_metadata_dict_to_node,
|
|
20
|
+
metadata_dict_to_node,
|
|
21
|
+
node_to_metadata_dict,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _transform_chroma_filter_condition(condition: str) -> str:
|
|
28
|
+
"""Translate standard metadata filter op to Chroma specific spec."""
|
|
29
|
+
if condition == "and":
|
|
30
|
+
return "$and"
|
|
31
|
+
elif condition == "or":
|
|
32
|
+
return "$or"
|
|
33
|
+
else:
|
|
34
|
+
raise ValueError(f"Filter condition {condition} not supported")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _transform_chroma_filter_operator(operator: str) -> str:
|
|
38
|
+
"""Translate standard metadata filter operator to Chroma specific spec."""
|
|
39
|
+
if operator == "!=":
|
|
40
|
+
return "$ne"
|
|
41
|
+
elif operator == "==":
|
|
42
|
+
return "$eq"
|
|
43
|
+
elif operator == ">":
|
|
44
|
+
return "$gt"
|
|
45
|
+
elif operator == "<":
|
|
46
|
+
return "$lt"
|
|
47
|
+
elif operator == ">=":
|
|
48
|
+
return "$gte"
|
|
49
|
+
elif operator == "<=":
|
|
50
|
+
return "$lte"
|
|
51
|
+
else:
|
|
52
|
+
raise ValueError(f"Filter operator {operator} not supported")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _to_chroma_filter(
    standard_filters: MetadataFilters,
) -> dict:
    """Translate standard metadata filters to Chroma specific spec.

    Args:
        standard_filters (MetadataFilters): generic llama-index metadata filters.

    Returns:
        dict: a Chroma ``where`` clause. A single filter is returned bare
        (``{key: {...}}``); multiple filters are wrapped under the boolean
        condition (``{"$and": [...]}``); no filters yields an empty dict.
    """
    filters = {}
    filters_list = []
    condition = standard_filters.condition or "and"
    condition = _transform_chroma_filter_condition(condition)
    if standard_filters.filters:
        # NOTE: renamed loop variable from `filter` to avoid shadowing the builtin.
        for subfilter in standard_filters.filters:
            if subfilter.operator:
                filters_list.append(
                    {
                        subfilter.key: {
                            _transform_chroma_filter_operator(
                                subfilter.operator
                            ): subfilter.value
                        }
                    }
                )
            else:
                # No operator given: fall back to Chroma's implicit equality form.
                filters_list.append({subfilter.key: subfilter.value})

    if len(filters_list) == 1:
        # If there is only one filter, return it directly
        return filters_list[0]
    elif len(filters_list) > 1:
        filters[condition] = filters_list
    return filters
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Error message raised when the optional `chromadb` dependency is missing.
import_err_msg = "`chromadb` package not found, please run `pip install chromadb`"

MAX_CHUNK_SIZE = 41665  # One less than the max chunk size for ChromaDB
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def chunk_list(
    lst: List[BaseNode], max_chunk_size: int
) -> Generator[List[BaseNode], None, None]:
    """Yield successive max_chunk_size-sized chunks from lst.

    Args:
        lst (List[BaseNode]): list of nodes with embeddings
        max_chunk_size (int): max chunk size

    Yields:
        Generator[List[BaseNode], None, None]: list of nodes with embeddings
    """
    # Walk the list in strides of max_chunk_size, slicing out each chunk.
    start = 0
    total = len(lst)
    while start < total:
        yield lst[start : start + max_chunk_size]
        start += max_chunk_size
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class ChromaVectorStore(BasePydanticVectorStore):
    """Chroma vector store.

    In this vector store, embeddings are stored within a ChromaDB collection.

    During query time, the index uses ChromaDB to query for the top
    k most similar nodes.

    Args:
        chroma_collection (chromadb.api.models.Collection.Collection):
            ChromaDB collection instance

    """

    stores_text: bool = True
    flat_metadata: bool = True

    # Connection/collection parameters are kept on the pydantic model so the
    # store can be serialized; the live collection handle is a private attr.
    collection_name: Optional[str]
    host: Optional[str]
    port: Optional[str]
    ssl: bool
    headers: Optional[Dict[str, str]]
    persist_dir: Optional[str]
    collection_kwargs: Dict[str, Any] = Field(default_factory=dict)

    _collection: Any = PrivateAttr()

    def __init__(
        self,
        chroma_collection: Optional[Any] = None,
        collection_name: Optional[str] = None,
        host: Optional[str] = None,
        port: Optional[str] = None,
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        persist_dir: Optional[str] = None,
        collection_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        if chroma_collection is None:
            client = chromadb.HttpClient(host=host, port=port, ssl=ssl, headers=headers)
            self._collection = client.get_or_create_collection(
                # BUGFIX: `collection_kwargs` defaults to None; unpacking None
                # with ** raised TypeError. Guard with `or {}`.
                name=collection_name,
                **(collection_kwargs or {}),
            )
        else:
            self._collection = cast(Collection, chroma_collection)

        super().__init__(
            host=host,
            port=port,
            ssl=ssl,
            headers=headers,
            collection_name=collection_name,
            persist_dir=persist_dir,
            collection_kwargs=collection_kwargs or {},
        )

    @classmethod
    def from_collection(cls, collection: Any) -> "ChromaVectorStore":
        """Build a store around an existing chromadb ``Collection`` instance.

        Raises:
            ImportError: if `chromadb` is not installed.
            Exception: if `collection` is not a chromadb Collection.
        """
        try:
            from chromadb import Collection
        except ImportError:
            raise ImportError(import_err_msg)

        if not isinstance(collection, Collection):
            raise Exception("argument is not chromadb collection instance")

        return cls(chroma_collection=collection)

    @classmethod
    def from_params(
        cls,
        collection_name: str,
        host: Optional[str] = None,
        port: Optional[str] = None,
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        persist_dir: Optional[str] = None,
        collection_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> "ChromaVectorStore":
        """Create a store from connection parameters.

        Either `persist_dir` (local PersistentClient) or both `host` and
        `port` (remote HttpClient) must be provided.

        Raises:
            ValueError: if neither `persist_dir` nor (`host`, `port`) is given.
        """
        # BUGFIX: the default was a shared mutable dict (`collection_kwargs: dict = {}`);
        # use a None sentinel instead (backward-compatible).
        collection_kwargs = collection_kwargs or {}
        if persist_dir:
            client = chromadb.PersistentClient(path=persist_dir)
            collection = client.get_or_create_collection(
                name=collection_name, **collection_kwargs
            )
        elif host and port:
            client = chromadb.HttpClient(host=host, port=port, ssl=ssl, headers=headers)
            collection = client.get_or_create_collection(
                name=collection_name, **collection_kwargs
            )
        else:
            raise ValueError(
                "Either `persist_dir` or (`host`,`port`) must be specified"
            )
        return cls(
            chroma_collection=collection,
            host=host,
            port=port,
            ssl=ssl,
            headers=headers,
            persist_dir=persist_dir,
            collection_kwargs=collection_kwargs,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the serialization class name."""
        return "ChromaVectorStore"

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            List[str]: node ids of all nodes added.
        """
        if not self._collection:
            raise ValueError("Collection not initialized")

        # Chroma caps the batch size of a single add(); submit in chunks.
        max_chunk_size = MAX_CHUNK_SIZE
        node_chunks = chunk_list(nodes, max_chunk_size)

        all_ids = []
        for node_chunk in node_chunks:
            embeddings = []
            metadatas = []
            ids = []
            documents = []
            for node in node_chunk:
                embeddings.append(node.get_embedding())
                metadata_dict = node_to_metadata_dict(
                    node, remove_text=True, flat_metadata=self.flat_metadata
                )
                # Chroma metadata values may not be None; coerce to empty string.
                for key in metadata_dict:
                    if metadata_dict[key] is None:
                        metadata_dict[key] = ""
                metadatas.append(metadata_dict)
                ids.append(node.node_id)
                documents.append(node.get_content(metadata_mode=MetadataMode.NONE))

            self._collection.add(
                embeddings=embeddings,
                ids=ids,
                metadatas=metadatas,
                documents=documents,
            )
            all_ids.extend(ids)

        return all_ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        # NOTE(review): deletion matches on the "document_id" metadata key
        # written by node_to_metadata_dict — confirm the key name matches
        # the serialization used at add() time.
        self._collection.delete(where={"document_id": ref_doc_id})

    @property
    def client(self) -> Any:
        """Return client."""
        return self._collection

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query_embedding (List[float]): query embedding
            similarity_top_k (int): top k most similar nodes

        Raises:
            ValueError: if metadata filters are given both on `query` and as
                a `where` kwarg.
        """
        if query.filters is not None:
            if "where" in kwargs:
                raise ValueError(
                    "Cannot specify metadata filters via both query and kwargs. "
                    "Use kwargs only for chroma specific items that are "
                    "not supported via the generic query interface."
                )
            where = _to_chroma_filter(query.filters)
        else:
            where = kwargs.pop("where", {})

        results = self._collection.query(
            query_embeddings=query.query_embedding,
            n_results=query.similarity_top_k,
            where=where,
            **kwargs,
        )

        logger.debug(f"> Top {len(results['documents'])} nodes:")
        nodes = []
        similarities = []
        ids = []
        # Chroma returns per-query lists; index [0] selects our single query.
        for node_id, text, metadata, distance in zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        ):
            try:
                node = metadata_dict_to_node(metadata)
                node.set_content(text)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata
                )

                node = TextNode(
                    text=text,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            nodes.append(node)

            # Convert Chroma's distance to a (0, 1] similarity score.
            similarity_score = math.exp(-distance)
            similarities.append(similarity_score)

            logger.debug(
                f"> [Node {node_id}] [Similarity score: {similarity_score}] "
                f"{truncate_text(str(text), 100)}"
            )
            ids.append(node_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["poetry-core"]
|
|
3
|
+
build-backend = "poetry.core.masonry.api"
|
|
4
|
+
|
|
5
|
+
[tool.codespell]
|
|
6
|
+
check-filenames = true
|
|
7
|
+
check-hidden = true
|
|
8
|
+
# Feel free to un-skip examples, and experimental, you will just need to
|
|
9
|
+
# work through many typos (--write-changes and --interactive will help)
|
|
10
|
+
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
|
|
11
|
+
|
|
12
|
+
[tool.mypy]
|
|
13
|
+
disallow_untyped_defs = true
|
|
14
|
+
# Remove venv skip when integrated with pre-commit
|
|
15
|
+
exclude = ["_static", "build", "examples", "notebooks", "venv"]
|
|
16
|
+
ignore_missing_imports = true
|
|
17
|
+
python_version = "3.8"
|
|
18
|
+
|
|
19
|
+
[tool.poetry]
|
|
20
|
+
name = "llama-index-vector-stores-chroma"
|
|
21
|
+
version = "0.0.1"
|
|
22
|
+
description = "llama-index vector_stores chroma integration"
|
|
23
|
+
authors = ["Your Name <you@example.com>"]
|
|
24
|
+
license = "MIT"
|
|
25
|
+
readme = "README.md"
|
|
26
|
+
packages = [{include = "llama_index/"}]
|
|
27
|
+
|
|
28
|
+
[tool.poetry.dependencies]
|
|
29
|
+
python = ">=3.8.1,<3.12"
|
|
30
|
+
llama-index-core = "^0.9.32"
|
|
31
|
+
chromadb = "^0.4.22"
|
|
32
|
+
|
|
33
|
+
[tool.poetry.group.dev.dependencies]
|
|
34
|
+
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
|
|
35
|
+
codespell = {extras = ["toml"], version = ">=v2.2.6"}
|
|
36
|
+
ipython = "8.10.0"
|
|
37
|
+
jupyter = "^1.0.0"
|
|
38
|
+
mypy = "0.991"
|
|
39
|
+
pre-commit = "3.2.0"
|
|
40
|
+
pylint = "2.15.10"
|
|
41
|
+
pytest = "7.2.1"
|
|
42
|
+
pytest-mock = "3.11.1"
|
|
43
|
+
ruff = "0.0.292"
|
|
44
|
+
tree-sitter-languages = "^1.8.0"
|
|
45
|
+
types-Deprecated = ">=0.1.0"
|
|
46
|
+
types-PyYAML = "^6.0.12.12"
|
|
47
|
+
types-protobuf = "^4.24.0.4"
|
|
48
|
+
types-redis = "4.5.5.0"
|
|
49
|
+
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991
|
|
50
|
+
types-setuptools = "67.1.0.0"
|