langroid 0.31.2__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.2.dist-info/RECORD +0 -162
  163. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,357 +0,0 @@
1
- import copy
2
- import logging
3
- from abc import ABC, abstractmethod
4
- from typing import Dict, List, Optional, Sequence, Tuple, Type
5
-
6
- import numpy as np
7
- import pandas as pd
8
-
9
- from langroid.embedding_models.base import EmbeddingModel, EmbeddingModelsConfig
10
- from langroid.embedding_models.models import OpenAIEmbeddingsConfig
11
- from langroid.mytypes import DocMetaData, Document
12
- from langroid.pydantic_v1 import BaseSettings
13
- from langroid.utils.algorithms.graph import components, topological_sort
14
- from langroid.utils.configuration import settings
15
- from langroid.utils.object_registry import ObjectRegistry
16
- from langroid.utils.output.printing import print_long_text
17
- from langroid.utils.pandas_utils import stringify
18
- from langroid.utils.pydantic_utils import flatten_dict
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
# Settings shared by every vector-store backend. Backend-specific config
# classes subclass this and override the defaults they need.
class VectorStoreConfig(BaseSettings):
    # deprecated, keeping it for backward compatibility
    type: str = ""

    # name of the collection to operate on
    collection_name: str | None = "temp"
    # replace collection if it already exists
    replace_collection: bool = False

    # storage / connection parameters
    storage_path: str = ".qdrant/data"
    cloud: bool = False
    batch_size: int = 200

    # embedding configuration; `embedding_model`, when provided, is used
    # directly by `VectorStore.__init__` instead of building a model
    # from `embedding`
    embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig(model_type="openai")
    embedding_model: Optional[EmbeddingModel] = None

    timeout: int = 60
    host: str = "127.0.0.1"
    port: int = 6333

    # used when parsing search results back as Document objects
    document_class: Type[Document] = Document
    metadata_class: Type[DocMetaData] = DocMetaData
    # compose_file: str = "langroid/vector_store/docker-compose-qdrant.yml"
41
-
42
-
43
class VectorStore(ABC):
    """
    Abstract base class for a vector store.

    Subclasses implement the abstract collection and document operations.
    This class supplies the shared helpers: backend selection (`create`),
    dataframe computations over documents (`compute_from_docs`), id
    back-filling (`maybe_add_ids`) and context-window expansion
    (`add_context_window` / `remove_overlaps`).
    """

    def __init__(self, config: VectorStoreConfig):
        """
        Keep a reference to `config` and resolve the embedding model.

        Args:
            config (VectorStoreConfig): Store settings. When
                `config.embedding_model` is None, a model is created from
                `config.embedding`; otherwise the supplied model is used as is.
        """
        self.config = config
        if config.embedding_model is None:
            self.embedding_model = EmbeddingModel.create(config.embedding)
        else:
            self.embedding_model = config.embedding_model

    @staticmethod
    def create(config: VectorStoreConfig) -> Optional["VectorStore"]:
        """
        Instantiate the backend that matches the concrete type of `config`.

        Args:
            config (VectorStoreConfig): A backend-specific config instance.

        Returns:
            Optional[VectorStore]: The matching store, or None (after logging
                a warning) when the config type is not recognized.
        """
        # Imported locally; presumably this avoids circular imports, since the
        # backend modules themselves import from this module.
        from langroid.vector_store.chromadb import ChromaDB, ChromaDBConfig
        from langroid.vector_store.lancedb import LanceDB, LanceDBConfig
        from langroid.vector_store.meilisearch import MeiliSearch, MeiliSearchConfig
        from langroid.vector_store.momento import MomentoVI, MomentoVIConfig
        from langroid.vector_store.qdrantdb import QdrantDB, QdrantDBConfig

        if isinstance(config, QdrantDBConfig):
            return QdrantDB(config)
        elif isinstance(config, ChromaDBConfig):
            return ChromaDB(config)
        elif isinstance(config, MomentoVIConfig):
            return MomentoVI(config)
        elif isinstance(config, LanceDBConfig):
            return LanceDB(config)
        elif isinstance(config, MeiliSearchConfig):
            return MeiliSearch(config)

        else:
            logger.warning(
                f"""
                Unknown vector store config: {config.__repr_name__()},
                so skipping vector store creation!
                If you intended to use a vector-store, please set a specific
                vector-store in your script, typically in the `vecdb` field of a
                `ChatAgentConfig`, otherwise set it to None.
                """
            )
            return None

    @abstractmethod
    def clear_empty_collections(self) -> int:
        """Clear all empty collections in the vector store.
        Returns the number of collections deleted.
        """
        pass

    @abstractmethod
    def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
        """
        Clear all collections in the vector store.

        Args:
            really (bool, optional): Whether to really clear all collections.
                Defaults to False.
            prefix (str, optional): Prefix of collections to clear.
        Returns:
            int: Number of collections deleted.
        """
        pass

    @abstractmethod
    def list_collections(self, empty: bool = False) -> List[str]:
        """List all collections in the vector store
        (only non empty collections if empty=False).
        """
        pass

    def set_collection(self, collection_name: str, replace: bool = False) -> None:
        """
        Set the current collection to the given collection name.

        Only the config is updated here; no collection is created.

        Args:
            collection_name (str): Name of the collection.
            replace (bool, optional): Whether to replace the collection if it
                already exists. Defaults to False.
        """

        self.config.collection_name = collection_name
        self.config.replace_collection = replace

    @abstractmethod
    def create_collection(self, collection_name: str, replace: bool = False) -> None:
        """Create a collection with the given name.
        Args:
            collection_name (str): Name of the collection.
            replace (bool, optional): Whether to replace the
                collection if it already exists. Defaults to False.
        """
        pass

    @abstractmethod
    def add_documents(self, documents: Sequence[Document]) -> None:
        """Add the given documents to the vector store."""
        pass

    def compute_from_docs(self, docs: List[Document], calc: str) -> str:
        """Compute a result on a set of documents,
        using a dataframe calc string like `df.groupby('state')['income'].mean()`.

        Args:
            docs (List[Document]): Documents to load into a dataframe named `df`.
            calc (str): A single pandas expression referring to `df`.

        Returns:
            str: The stringified result, or an error message when evaluation
                fails (so that the caller, e.g. an LLM, can repair `calc`).
        """
        # convert each doc to a dict, using dotted paths for nested fields
        dicts = [flatten_dict(doc.dict(by_alias=True)) for doc in docs]
        df = pd.DataFrame(dicts)

        # NOTE(review): `calc` may originate from an LLM or another untrusted
        # source. `pd.eval` is limited to a single expression, but it still
        # executes method calls on `df`; confirm this is acceptable for the
        # deployment before exposing it to external input.
        try:
            result = pd.eval(  # safer than eval but limited to single expression
                calc,
                engine="python",
                parser="pandas",
                local_dict={"df": df},
            )
        except Exception as e:
            # return error message so LLM can fix the calc string if needed
            err = f"""
            Error encountered in pandas eval: {str(e)}
            """
            if isinstance(e, KeyError) and "not in index" in str(e):
                # Pd.eval sometimes fails on a perfectly valid exprn like
                # df.loc[..., 'column'] with a KeyError.
                err += """
                Maybe try a different way, e.g.
                instead of df.loc[..., 'column'], try df.loc[...]['column']
                """
            return err
        return stringify(result)

    def maybe_add_ids(self, documents: Sequence[Document]) -> None:
        """Add ids to metadata if absent, since some
        vecdbs don't like having blank ids.

        Documents are modified in place; an id of None or "" counts as absent.
        """
        for d in documents:
            if d.metadata.id in [None, ""]:
                d.metadata.id = ObjectRegistry.new_id()

    @abstractmethod
    def similar_texts_with_scores(
        self,
        text: str,
        k: int = 1,
        where: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """
        Find k most similar texts to the given text, in terms of vector distance metric
        (e.g., cosine similarity).

        Args:
            text (str): The text to find similar texts for.
            k (int, optional): Number of similar texts to retrieve. Defaults to 1.
            where (Optional[str], optional): Where clause to filter the search.

        Returns:
            List[Tuple[Document,float]]: List of (Document, score) tuples.

        """
        pass

    def add_context_window(
        self, docs_scores: List[Tuple[Document, float]], neighbors: int = 0
    ) -> List[Tuple[Document, float]]:
        """
        In each doc's metadata, there may be a window_ids field indicating
        the ids of the chunks around the current chunk.
        These window_ids may overlap, so we
        - coalesce each overlapping groups into a single window (maintaining ordering),
        - create a new document for each part, preserving metadata,

        We may have stored a longer set of window_ids than we need during chunking.
        Now, we just want `neighbors` on each side of the center of the window_ids list.

        Args:
            docs_scores (List[Tuple[Document, float]]): List of pairs of documents
                to add context windows to together with their match scores.
            neighbors (int, optional): Number of neighbors on "each side" of match to
                retrieve. Defaults to 0.
                "Each side" here means before and after the match,
                in the original text.

        Returns:
            List[Tuple[Document, float]]: List of (Document, score) tuples.
                The input list is returned unchanged when `neighbors` is 0 or
                when none of the documents is marked as a chunk.

        Raises:
            ValueError: If a document has non-empty `window_ids` that do not
                contain the document's own id.
        """
        # Nothing to expand: skip all further work.
        if neighbors == 0:
            return docs_scores
        docs = [d for d, _ in docs_scores]
        scores = [s for _, s in docs_scores]
        if not any(d.metadata.is_chunk for d in docs):
            return docs_scores

        # We return a larger context around each match, i.e.
        # a window of `neighbors` on each side of the match.
        window_ids_list = []
        id2metadata = {}
        # id -> highest score of a doc it appears in
        id2max_score: Dict[int | str, float] = {}
        for d, score in zip(docs, scores):
            window_ids = d.metadata.window_ids
            if len(window_ids) == 0:
                window_ids = [d.id()]
            id2metadata.update({wid: d.metadata for wid in window_ids})

            # Seed with the current score rather than 0, so that negative
            # scores are not silently clamped to 0.
            id2max_score.update(
                {wid: max(id2max_score.get(wid, score), score) for wid in window_ids}
            )
            n = len(window_ids)
            chunk_idx = window_ids.index(d.id())
            neighbor_ids = window_ids[
                max(0, chunk_idx - neighbors) : min(n, chunk_idx + neighbors + 1)
            ]
            window_ids_list += [neighbor_ids]

        # window_ids could be from different docs,
        # and they may overlap, so we coalesce overlapping groups into
        # separate windows.
        window_ids_list = self.remove_overlaps(window_ids_list)
        final_docs = []
        final_scores = []
        for w in window_ids_list:
            # metadata of the merged doc is copied from the window's first chunk
            metadata = copy.deepcopy(id2metadata[w[0]])
            metadata.window_ids = w
            document = Document(
                content=" ".join([d.content for d in self.get_documents_by_ids(w)]),
                metadata=metadata,
            )
            # make a fresh id since content is in general different
            document.metadata.id = ObjectRegistry.new_id()
            final_docs += [document]
            final_scores += [max(id2max_score[wid] for wid in w)]
        return list(zip(final_docs, final_scores))

    @staticmethod
    def remove_overlaps(windows: List[List[str]]) -> List[List[str]]:
        """
        Given a collection of windows, where each window is a sequence of ids,
        identify groups of overlapping windows, and for each overlapping group,
        order the chunk-ids using topological sort so they appear in the original
        order in the text.

        Args:
            windows (List[List[str]]): List of windows, where each window is a
                sequence of ids.

        Returns:
            List[List[str]]: List of windows, where each window is a sequence of
                ids, and no two windows overlap. Empty input gives an empty list.
        """
        if not windows:
            return []

        ids = {wid for w in windows for wid in w}
        # id -> {win -> # pos}
        id2win2pos: Dict[str, Dict[int, int]] = {wid: {} for wid in ids}

        for i, w in enumerate(windows):
            for j, wid in enumerate(w):
                id2win2pos[wid][i] = j

        n = len(windows)
        # relation between windows:
        order = np.zeros((n, n), dtype=np.int8)
        for i, w in enumerate(windows):
            w_ids = set(w)
            for j, x in enumerate(windows):
                if i == j:
                    continue
                common = w_ids.intersection(x)
                if not common:
                    continue
                wid = next(iter(common))  # any common id
                if id2win2pos[wid][i] > id2win2pos[wid][j]:
                    order[i, j] = -1  # win i is before win j
                else:
                    order[i, j] = 1  # win i is after win j

        # find groups of windows that overlap, like connected components in a graph
        groups = components(np.abs(order))

        # order the chunk-ids in each group using topological sort
        new_windows = []
        for g in groups:
            # find total ordering among windows in group based on order matrix
            # (this is a topological sort)
            _g = np.array(g)
            order_matrix = order[_g][:, _g]
            ordered_window_indices = topological_sort(order_matrix)
            ordered_window_ids = [windows[i] for i in _g[ordered_window_indices]]
            flattened = [wid for w in ordered_window_ids for wid in w]
            # dict.fromkeys de-duplicates while keeping first-seen order
            flattened_deduped = list(dict.fromkeys(flattened))
            # Note we are not going to split these, and instead we'll return
            # larger windows from concatenating the connected groups.
            # This ensures context is retained for LLM q/a
            new_windows += [flattened_deduped]

        return new_windows

    @abstractmethod
    def get_all_documents(self, where: str = "") -> List[Document]:
        """
        Get all documents in the current collection, possibly filtered by `where`.
        """
        pass

    @abstractmethod
    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        """
        Get documents by their ids.
        Args:
            ids (List[str]): List of document ids.

        Returns:
            List[Document]: List of documents
        """
        pass

    @abstractmethod
    def delete_collection(self, collection_name: str) -> None:
        """Delete the collection with the given name."""
        pass

    def show_if_debug(self, doc_score_pairs: List[Tuple[Document, float]]) -> None:
        """Print the content of each matched document when `settings.debug` is set.

        Args:
            doc_score_pairs (List[Tuple[Document, float]]): (Document, score)
                pairs; only the document content is printed.
        """
        if settings.debug:
            for i, (d, _) in enumerate(doc_score_pairs):
                print_long_text("red", "italic red", f"\nMATCH-{i}\n", d.content)
@@ -1,214 +0,0 @@
1
- import json
2
- import logging
3
- from typing import Any, Dict, List, Optional, Sequence, Tuple
4
-
5
- from langroid.embedding_models.base import (
6
- EmbeddingModelsConfig,
7
- )
8
- from langroid.embedding_models.models import OpenAIEmbeddingsConfig
9
- from langroid.exceptions import LangroidImportError
10
- from langroid.mytypes import Document
11
- from langroid.utils.configuration import settings
12
- from langroid.utils.output.printing import print_long_text
13
- from langroid.vector_store.base import VectorStore, VectorStoreConfig
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
# ChromaDB-specific settings: overrides a few of the VectorStoreConfig
# defaults (note the Chroma-specific storage path).
class ChromaDBConfig(VectorStoreConfig):
    # collection used when none is set explicitly
    collection_name: str = "temp"
    # passed to the chromadb client as its `persist_directory`
    storage_path: str = ".chroma/data"
    # embedding settings used to build the embedding function
    embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
    # connection parameters
    host: str = "127.0.0.1"
    port: int = 6333
24
-
25
-
26
class ChromaDB(VectorStore):
    """Vector store implementation backed by a `chromadb` client."""

    def __init__(self, config: Optional[ChromaDBConfig] = None):
        """
        Create the chromadb client and, if a collection name is configured,
        create (or get) that collection.

        Args:
            config (Optional[ChromaDBConfig]): Store settings. When omitted, a
                fresh `ChromaDBConfig()` is built for this instance.

        Raises:
            LangroidImportError: If the `chromadb` package is not installed.
        """
        # Build the default per call: a config instance used as a default
        # argument would be shared by every ChromaDB() created without a
        # config, and `create_collection` mutates it.
        if config is None:
            config = ChromaDBConfig()
        # Fail fast on a missing dependency, before building the embedding model.
        try:
            import chromadb
        except ImportError:
            raise LangroidImportError("chromadb", "chromadb")
        super().__init__(config)
        self.config = config
        self.embedding_fn = self.embedding_model.embedding_fn()
        self.client = chromadb.Client(
            chromadb.config.Settings(
                # chroma_db_impl="duckdb+parquet",
                # is_persistent=bool(config.storage_path),
                persist_directory=config.storage_path,
            )
        )
        if self.config.collection_name is not None:
            self.create_collection(
                self.config.collection_name,
                replace=self.config.replace_collection,
            )

    def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
        """Clear all collections in the vector store with the given prefix.

        Args:
            really (bool, optional): Must be True for anything to be deleted.
            prefix (str, optional): Only collections whose name starts with
                this prefix are deleted. Defaults to "" (all collections).

        Returns:
            int: Number of collections deleted.
        """

        if not really:
            logger.warning("Not deleting all collections, set really=True to confirm")
            return 0
        coll = [c for c in self.client.list_collections() if c.name.startswith(prefix)]
        if len(coll) == 0:
            logger.warning(f"No collections found with prefix {prefix}")
            return 0
        n_empty_deletes = 0
        n_non_empty_deletes = 0
        for c in coll:
            # query the size once per collection
            if c.count() == 0:
                n_empty_deletes += 1
            else:
                n_non_empty_deletes += 1
            self.client.delete_collection(name=c.name)
        logger.warning(
            f"""
            Deleted {n_empty_deletes} empty collections and
            {n_non_empty_deletes} non-empty collections.
            """
        )
        return n_empty_deletes + n_non_empty_deletes

    def clear_empty_collections(self) -> int:
        """Delete every collection that holds no items.

        Returns:
            int: Number of collections deleted.
        """
        colls = self.client.list_collections()
        n_deletes = 0
        for coll in colls:
            if coll.count() == 0:
                n_deletes += 1
                self.client.delete_collection(name=coll.name)
        return n_deletes

    def list_collections(self, empty: bool = False) -> List[str]:
        """
        List non-empty collections in the vector store.
        Args:
            empty (bool, optional): Whether to list empty collections.
        Returns:
            List[str]: List of non-empty collection names
                (all collection names when `empty` is True).
        """
        colls = self.client.list_collections()
        if empty:
            return [coll.name for coll in colls]
        return [coll.name for coll in colls if coll.count() > 0]

    def create_collection(self, collection_name: str, replace: bool = False) -> None:
        """
        Create a collection in the vector store, optionally replacing an existing
        collection if `replace` is True.

        Also makes `collection_name` the current collection in the config.

        Args:
            collection_name (str): Name of the collection to create or replace.
            replace (bool, optional): Whether to replace an existing collection.
                Defaults to False.

        """
        self.config.collection_name = collection_name
        if collection_name in self.list_collections(empty=True) and replace:
            logger.warning(f"Replacing existing collection {collection_name}")
            self.client.delete_collection(collection_name)
        # when not replacing, an existing collection is reused (get_or_create)
        self.collection = self.client.create_collection(
            name=self.config.collection_name,
            embedding_function=self.embedding_fn,
            get_or_create=not replace,
        )

    def add_documents(self, documents: Sequence[Document]) -> None:
        """
        Add documents to the current collection, creating it if needed.

        Args:
            documents (Sequence[Document]): Documents to ingest. None or an
                empty sequence is a no-op.

        Raises:
            ValueError: If no collection name is set in the config.
        """
        # Guard before any use of `documents`: iterating None would raise.
        if documents is None or len(documents) == 0:
            return
        super().maybe_add_ids(documents)
        contents: List[str] = [document.content for document in documents]
        # convert metadatas to dicts so chroma can handle them
        metadata_dicts: List[dict[str, Any]] = [
            d.metadata.dict_bool_int() for d in documents
        ]
        for m in metadata_dicts:
            # chroma does not handle non-atomic types in metadata
            m["window_ids"] = ",".join(m["window_ids"])

        ids = [str(d.id()) for d in documents]

        colls = self.list_collections(empty=True)
        if self.config.collection_name is None:
            raise ValueError("No collection name set, cannot ingest docs")
        if self.config.collection_name not in colls:
            self.create_collection(self.config.collection_name, replace=True)

        self.collection.add(
            # embedding_models=embedding_models,
            documents=contents,
            metadatas=metadata_dicts,
            ids=ids,
        )

    def get_all_documents(self, where: str = "") -> List[Document]:
        """
        Get all documents in the current collection, possibly filtered by `where`.

        Args:
            where (str, optional): JSON string, parsed and passed to chroma as
                its `where` filter. Empty string means no filter.

        Returns:
            List[Document]: The matching documents.
        """
        where_filter = json.loads(where) if where else None
        results = self.collection.get(
            include=["documents", "metadatas"],
            where=where_filter,
        )
        # wrap in an outer list: `_docs_from_results` reads element [0]
        results["documents"] = [results["documents"]]
        results["metadatas"] = [results["metadatas"]]
        return self._docs_from_results(results)

    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        """
        Get documents by their ids, in the order the ids are given.

        Args:
            ids (List[str]): List of document ids.

        Returns:
            List[Document]: Documents found; ids with no match are skipped
                (with a warning).
        """
        # get them one by one since chroma mangles the order of the results
        # when fetched from a list of ids.
        documents: List[Any] = []
        metadatas: List[Any] = []
        for doc_id in ids:
            r = self.collection.get(ids=[doc_id], include=["documents", "metadatas"])
            if not r["documents"]:
                logger.warning(f"No document found with id {doc_id}")
                continue
            documents.append(r["documents"][0])
            metadatas.append(r["metadatas"][0])
        final_results = {"documents": [documents], "metadatas": [metadatas]}
        return self._docs_from_results(final_results)

    def delete_collection(self, collection_name: str) -> None:
        """Delete the named collection, ignoring failures (best effort).

        Args:
            collection_name (str): Name of the collection to delete.
        """
        try:
            self.client.delete_collection(name=collection_name)
        except Exception as e:
            # best effort: stay non-fatal, but leave a trace for debugging
            logger.debug(f"Could not delete collection {collection_name}: {e}")

    def similar_texts_with_scores(
        self, text: str, k: int = 1, where: Optional[str] = None
    ) -> List[Tuple[Document, float]]:
        """
        Find the k texts in the current collection most similar to `text`.

        Args:
            text (str): The query text.
            k (int, optional): Number of results to retrieve. Defaults to 1.
            where (Optional[str], optional): JSON string, parsed and passed to
                chroma as its `where` filter.

        Returns:
            List[Tuple[Document, float]]: List of (Document, score) tuples,
                where score is `1 - distance`. Empty when the collection is
                empty or `k <= 0`.
        """
        n_results = min(self.collection.count(), k)
        if n_results <= 0:
            # chroma rejects a non-positive n_results, so return nothing
            return []
        where_filter = json.loads(where) if where else None
        results = self.collection.query(
            query_texts=[text],
            n_results=n_results,
            where=where_filter,
            include=["documents", "distances", "metadatas"],
        )
        docs = self._docs_from_results(results)
        # chroma distances are 1 - cosine.
        # NOTE(review): that only holds if the collection uses the cosine
        # distance space; confirm against how the collection is created.
        scores = [1 - s for s in results["distances"][0]]
        return list(zip(docs, scores))

    def _docs_from_results(self, results: Dict[str, Any]) -> List[Document]:
        """
        Helper function to convert results from ChromaDB to a list of Documents
        Args:
            results (dict): results from ChromaDB; `results["documents"][0]` and
                `results["metadatas"][0]` are read as parallel lists.

        Returns:
            List[Document]: list of Documents
        """
        if len(results["documents"][0]) == 0:
            return []
        contents = results["documents"][0]
        if settings.debug:
            for i, c in enumerate(contents):
                print_long_text("red", "italic red", f"MATCH-{i}", c)
        metadatas = results["metadatas"][0]
        for m in metadatas:
            # restore the stringified list of window_ids into the original List[str];
            # a missing key is treated like an empty list
            window_ids = m.get("window_ids") or ""
            if window_ids.strip() == "":
                m["window_ids"] = []
            else:
                m["window_ids"] = window_ids.split(",")
        docs = [
            self.config.document_class(
                content=d, metadata=self.config.metadata_class(**m)
            )
            for d, m in zip(contents, metadatas)
        ]
        return docs