langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.1.dist-info/RECORD +0 -162
  163. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,401 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- from typing import (
5
- TYPE_CHECKING,
6
- Any,
7
- Dict,
8
- Generator,
9
- List,
10
- Optional,
11
- Sequence,
12
- Tuple,
13
- Type,
14
- )
15
-
16
- import pandas as pd
17
- from dotenv import load_dotenv
18
-
19
- from langroid.pydantic_v1 import BaseModel, ValidationError, create_model
20
-
21
- if TYPE_CHECKING:
22
- from lancedb.query import LanceVectorQueryBuilder
23
-
24
- from langroid.embedding_models.base import (
25
- EmbeddingModelsConfig,
26
- )
27
- from langroid.embedding_models.models import OpenAIEmbeddingsConfig
28
- from langroid.exceptions import LangroidImportError
29
- from langroid.mytypes import Document, EmbeddingFunction
30
- from langroid.utils.configuration import settings
31
- from langroid.utils.pydantic_utils import (
32
- dataframe_to_document_model,
33
- dataframe_to_documents,
34
- )
35
- from langroid.vector_store.base import VectorStore, VectorStoreConfig
36
-
37
- try:
38
- import lancedb
39
- from lancedb.pydantic import LanceModel, Vector
40
-
41
- has_lancedb = True
42
- except ImportError:
43
- has_lancedb = False
44
-
45
- logger = logging.getLogger(__name__)
46
-
47
-
48
class LanceDBConfig(VectorStoreConfig):
    """Configuration for the LanceDB-backed vector store."""

    # Cloud-mode flag; `LanceDB.__init__` logs that cloud is unavailable
    # and resets this to False.
    cloud: bool = False
    # Name of the table that documents are written to and read from.
    collection_name: str | None = "temp"
    # URI handed to `lancedb.connect` for local storage.
    storage_path: str = ".lancedb/data"
    # Embedding-model configuration used by the store.
    embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
    # Metric name passed to the query builder's `.metric(...)` call.
    distance: str = "cosine"
54
-
55
-
56
- class LanceDB(VectorStore):
57
def __init__(self, config: LanceDBConfig = LanceDBConfig()):
    """
    Set up the store and connect to the local LanceDB database.

    Args:
        config (LanceDBConfig): store settings. If `config.cloud` is True a
            warning is logged, the flag is reset to False, and the local
            store at `config.storage_path` is used instead.

    Raises:
        LangroidImportError: if the `lancedb` package is not installed.
    """
    super().__init__(config)
    if not has_lancedb:
        raise LangroidImportError("lancedb", "lancedb")

    self.config: LanceDBConfig = config
    self.embedding_fn: EmbeddingFunction = self.embedding_model.embedding_fn()
    self.embedding_dim = self.embedding_model.embedding_dims
    self.host = config.host
    self.port = config.port
    self.is_from_dataframe = False  # were docs ingested from a dataframe?
    self.df_metadata_columns: List[str] = []  # metadata columns from dataframe

    load_dotenv()
    if self.config.cloud:
        logger.warning(
            "LanceDB Cloud is not available yet. Switching to local storage."
        )
        config.cloud = False

    # Connect unconditionally: "switching to local storage" must actually
    # open the local store, otherwise `self.client` would be left unset.
    try:
        self.client = lancedb.connect(
            uri=config.storage_path,
        )
    except Exception as e:
        # Best-effort fallback to a sibling path when the configured
        # location cannot be opened.
        new_storage_path = config.storage_path + ".new"
        logger.warning(
            f"""
            Error connecting to local LanceDB at {config.storage_path}:
            {e}
            Switching to {new_storage_path}
            """
        )
        self.client = lancedb.connect(
            uri=new_storage_path,
        )
93
-
94
def clear_empty_collections(self) -> int:
    """
    Drop every table that contains no rows.

    Returns:
        int: number of tables that were dropped.
    """
    n_deletes = 0
    # `list_collections()` omits empty tables unless `empty=True` is passed,
    # so they must be requested explicitly or nothing would ever be dropped.
    for name in self.list_collections(empty=True):
        if self.client.open_table(name).head(1).shape[0] == 0:
            self.client.drop_table(name)
            n_deletes += 1
    return n_deletes
103
-
104
def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
    """
    Drop all tables, empty or not, whose names start with `prefix`.

    Args:
        really (bool): safety switch; nothing is dropped unless True.
        prefix (str): only tables whose names start with this are dropped.

    Returns:
        int: total number of tables dropped.
    """
    if not really:
        logger.warning("Not deleting all collections, set really=True to confirm")
        return 0
    targets = [
        name for name in self.list_collections(empty=True) if name.startswith(prefix)
    ]
    if len(targets) == 0:
        logger.warning(f"No collections found with prefix {prefix}")
        return 0
    n_empty_deletes = 0
    n_non_empty_deletes = 0
    for name in targets:
        # Peek at one row to classify the table before dropping it.
        row_count = self.client.open_table(name).head(1).shape[0]
        if row_count == 0:
            n_empty_deletes += 1
        if row_count > 0:
            n_non_empty_deletes += 1
        self.client.drop_table(name)
    logger.warning(
        f"""
        Deleted {n_empty_deletes} empty collections and
        {n_non_empty_deletes} non-empty collections.
        """
    )
    return n_empty_deletes + n_non_empty_deletes
129
-
130
def list_collections(self, empty: bool = False) -> List[str]:
    """
    List the table names in the connected store.

    Args:
        empty (bool, optional): if True, tables without rows are included;
            otherwise only tables with at least one row are returned.

    Returns:
        List[str]: the matching table names.
    """
    names = self.client.table_names(limit=None)
    if len(names) == 0:
        return []
    if empty:  # caller wants every table, populated or not
        return names  # type: ignore
    # Fetch one row per table to decide whether it is populated.
    row_counts = {
        name: self.client.open_table(name).head(1).shape[0] for name in names
    }
    return [name for name in names if row_counts[name] > 0]
145
-
146
def _create_lance_schema(self, doc_cls: Type[Document]) -> Type[BaseModel]:
    """
    NOTE: NOT USED, but leaving it here as it may be useful.

    Build a LanceModel subclass with these fields:
    - `id` (str)
    - `vector`, whose dimension equals the embedding dimension of the
      embedding model
    - every field of `doc_cls`, added in alphabetical order of field name

    Args:
        doc_cls (Type[Document]): a Pydantic model that subclasses Document.

    Returns:
        Type[BaseModel]: a new Pydantic model subclassing LanceModel.

    Raises:
        ValueError: if `doc_cls` is not a subclass of Document.
        LangroidImportError: if `lancedb` is not installed.
    """
    if not issubclass(doc_cls, Document):
        raise ValueError("DocClass must be a subclass of Document")

    if not has_lancedb:
        raise LangroidImportError("lancedb", "lancedb")

    # Fixed fields first, then the document's own fields.
    fields = {"id": (str, ...), "vector": (Vector(self.embedding_dim), ...)}

    doc_fields = doc_cls.__fields__
    for field_name in sorted(doc_fields):
        field = doc_fields[field_name]
        fields[field_name] = (field.outer_type_, field.default)

    NewModel = create_model(
        "NewModel", __base__=LanceModel, **fields
    )  # type: ignore
    return NewModel  # type: ignore
191
-
192
def create_collection(self, collection_name: str, replace: bool = False) -> None:
    """
    Select `collection_name` as the active collection.

    No table is created here; this only records the settings on
    `self.config`, which `add_documents` reads when ingesting.

    Args:
        collection_name (str): name of the collection to use from now on.
        replace (bool): stored as `config.replace_collection`; tells
            `add_documents` whether to replace an existing non-empty table.
    """
    # Record the requested name as well as the replace flag, so later calls
    # operate on the collection the caller asked for.
    self.config.collection_name = collection_name
    self.config.replace_collection = replace
194
-
195
def add_documents(self, documents: Sequence[Document]) -> None:
    """
    Embed `documents` and write them to the configured collection.

    If the table already holds rows and `config.replace_collection` is
    False, rows are appended. Otherwise the table is created or overwritten.

    Args:
        documents (Sequence[Document]): documents to ingest; an empty
            sequence is a no-op.

    Raises:
        ValueError: if no collection name is configured.
    """
    super().maybe_add_ids(documents)
    if len(documents) == 0:
        return
    coll_name = self.config.collection_name
    if coll_name is None:
        # Fail before paying for embeddings.
        raise ValueError("No collection name set, cannot ingest docs")
    embedding_vecs = self.embedding_fn([doc.content for doc in documents])

    # Append only when the table already has rows and we were not asked to
    # replace it. A missing, empty, or to-be-replaced table is (re)created.
    append = (
        not self.config.replace_collection
        and coll_name in self.list_collections(empty=True)
        and self.client.open_table(coll_name).head(1).shape[0] > 0
    )

    ids = [str(d.id()) for d in documents]
    # don't insert all at once, batch in chunks of b,
    # else we get an API error
    b = self.config.batch_size

    def make_batches() -> Generator[List[Dict[str, Any]], None, None]:
        for i in range(0, len(ids), b):
            yield [
                dict(
                    id=ids[i + j],
                    vector=embedding_vecs[i + j],
                    **doc.dict(),
                )
                for j, doc in enumerate(documents[i : i + b])
            ]

    try:
        if append:
            self.client.open_table(coll_name).add(make_batches())
        else:
            batch_gen = make_batches()
            # Use the first batch to create the table. "overwrite" also
            # handles a table that exists but is empty, which "create"
            # would reject.
            tbl = self.client.create_table(
                coll_name,
                data=next(batch_gen),
                mode="overwrite",
            )
            # Add the remaining batches, if there are any.
            if len(ids) > b:
                tbl.add(batch_gen)
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.
        logger.error(
            f"""
            Error adding documents to LanceDB: {e}
            POSSIBLE REMEDY: Delete the LanceDB storage directory
            {self.config.storage_path} and try again.
            """
        )
258
-
259
def add_dataframe(
    self,
    df: pd.DataFrame,
    content: str = "content",
    metadata: List[str] = [],
) -> None:
    """
    Add a dataframe to the collection.

    The caller's dataframe is left untouched; the `vector` and `id` columns
    are added to an internal copy.

    Args:
        df (pd.DataFrame): A dataframe
        content (str): The name of the column in the dataframe that contains the
            text content to be embedded using the embedding model.
        metadata (List[str]): A list of column names in the dataframe that contain
            metadata to be stored in the database. Defaults to [].

    Raises:
        ValueError: if no collection name is configured.
    """
    coll_name = self.config.collection_name
    if coll_name is None:
        raise ValueError("No collection name set, cannot ingest dataframe")

    self.is_from_dataframe = True
    # Copy so that the shared default list and the caller's list are never
    # mutated by the `+=` below.
    actual_metadata = metadata.copy()
    self.df_metadata_columns = actual_metadata  # could be updated below

    # Embed the content column.
    content_values = df[content].values.tolist()
    embedding_vecs = self.embedding_fn(content_values)

    # Work on a copy so the column assignments below do not leak into the
    # dataframe owned by the caller.
    df = df.copy()
    df["vector"] = embedding_vecs
    if content != "content":
        # rename content column to "content", leave existing column intact
        df = df.rename(columns={content: "content"}, inplace=False)

    if "id" not in df.columns:
        docs = dataframe_to_documents(df, content="content", metadata=metadata)
        ids = [str(d.id()) for d in docs]
        df["id"] = ids

    if "id" not in actual_metadata:
        actual_metadata += ["id"]

    colls = self.list_collections(empty=True)
    if (
        coll_name not in colls
        or self.client.open_table(coll_name).head(1).shape[0] == 0
    ):
        # collection either doesn't exist or is empty, so replace it
        # and set new schema from df
        self.client.create_table(
            coll_name,
            data=df,
            mode="overwrite",
        )
        # NOTE(review): `content` is the caller's original column name, but
        # the column has been renamed to "content" above; confirm that
        # `dataframe_to_document_model` expects the original name here.
        doc_cls = dataframe_to_document_model(
            df,
            content=content,
            metadata=actual_metadata,
            exclude=["vector"],
        )
        self.config.document_class = doc_cls  # type: ignore
    else:
        # collection exists and is not empty, so append to it
        tbl = self.client.open_table(coll_name)
        tbl.add(df)
319
-
320
def delete_collection(self, collection_name: str) -> None:
    """
    Drop the table named `collection_name`.

    The drop is issued with `ignore_missing=True`.

    Args:
        collection_name (str): name of the table to drop.
    """
    db = self.client
    db.drop_table(collection_name, ignore_missing=True)
322
-
323
- def _lance_result_to_docs(
324
- self, result: "LanceVectorQueryBuilder"
325
- ) -> List[Document]:
326
- if self.is_from_dataframe:
327
- df = result.to_pandas()
328
- return dataframe_to_documents(
329
- df,
330
- content="content",
331
- metadata=self.df_metadata_columns,
332
- doc_cls=self.config.document_class,
333
- )
334
- else:
335
- records = result.to_arrow().to_pylist()
336
- return self._records_to_docs(records)
337
-
338
- def _records_to_docs(self, records: List[Dict[str, Any]]) -> List[Document]:
339
- try:
340
- docs = [self.config.document_class(**rec) for rec in records]
341
- except ValidationError as e:
342
- raise ValueError(
343
- f"""
344
- Error validating LanceDB result: {e}
345
- HINT: This could happen when you're re-using an
346
- existing LanceDB store with a different schema.
347
- Try deleting your local lancedb storage at `{self.config.storage_path}`
348
- re-ingesting your documents and/or replacing the collections.
349
- """
350
- )
351
- return docs
352
-
353
def get_all_documents(self, where: str = "") -> List[Document]:
    """
    Fetch the documents in the configured collection.

    Args:
        where (str): filter expression; an empty string is passed to the
            query as None.

    Returns:
        List[Document]: the documents produced by `_lance_result_to_docs`.

    Raises:
        ValueError: if no collection name is configured.
    """
    coll_name = self.config.collection_name
    if coll_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    table = self.client.open_table(coll_name)
    row_filter = where or None
    query = table.search(None).where(row_filter).limit(None)
    return self._lance_result_to_docs(query)
359
-
360
def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
    """
    Look up documents by id, one query per id.

    Args:
        ids (List[str]): ids to fetch; each is converted with `str()`.

    Returns:
        List[Document]: the first match for each id, in the order given.
        Ids with no match are skipped.

    Raises:
        ValueError: if no collection name is configured.
    """
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    tbl = self.client.open_table(self.config.collection_name)
    docs = []
    for raw_id in ids:
        # Double any single quote so the id cannot terminate the quoted
        # string literal in the filter expression.
        quoted = str(raw_id).replace("'", "''")
        results = self._lance_result_to_docs(
            tbl.search().where(f"id == '{quoted}'")
        )
        if len(results) > 0:
            docs.append(results[0])
    return docs
371
-
372
def similar_texts_with_scores(
    self,
    text: str,
    k: int = 1,
    where: Optional[str] = None,
) -> List[Tuple[Document, float]]:
    """
    Return the `k` stored documents nearest to `text`, with scores.

    Args:
        text (str): query text; it is embedded with the store's embedding
            function.
        k (int): maximum number of matches to return.
        where (Optional[str]): filter expression applied as a pre-filter.

    Returns:
        List[Tuple[Document, float]]: (document, score) pairs where
        score = 1 - `_distance` as reported by LanceDB.
    """
    embedding = self.embedding_fn([text])[0]
    tbl = self.client.open_table(self.config.collection_name)
    result = (
        tbl.search(embedding)
        .metric(self.config.distance)
        .where(where, prefilter=True)
        .limit(k)
    )
    # Materialize the query exactly once and derive both the documents and
    # the distances from that single result set, instead of converting the
    # query builder twice.
    if self.is_from_dataframe:
        df = result.to_pandas()
        distances = df["_distance"].tolist()
        docs = dataframe_to_documents(
            df,
            content="content",
            metadata=self.df_metadata_columns,
            doc_cls=self.config.document_class,
        )
    else:
        records = result.to_arrow().to_pylist()
        distances = [rec["_distance"] for rec in records]
        docs = self._records_to_docs(records)
    # note _distance is 1 - cosine
    scores = [1 - dist for dist in distances]
    if len(docs) == 0:
        logger.warning(f"No matches found for {text}")
        return []
    if settings.debug:
        logger.info(f"Found {len(docs)} matches, max score: {max(scores)}")
    doc_score_pairs = list(zip(docs, scores))
    self.show_if_debug(doc_score_pairs)
    return doc_score_pairs
@@ -1,299 +0,0 @@
1
- """
2
- MeiliSearch as a pure document store, without its
3
- (experimental) vector-store functionality.
4
- We aim to use MeiliSearch for fast lexical search.
5
- Note that what we call "Collection" in Langroid is referred to as
6
- "Index" in MeiliSearch. Each data-store has its own terminology,
7
- but for uniformity we use the Langroid terminology here.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import asyncio
13
- import logging
14
- import os
15
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Tuple
16
-
17
- from dotenv import load_dotenv
18
-
19
- if TYPE_CHECKING:
20
- from meilisearch_python_sdk.index import AsyncIndex
21
- from meilisearch_python_sdk.models.documents import DocumentsInfo
22
-
23
-
24
- from langroid.exceptions import LangroidImportError
25
- from langroid.mytypes import DocMetaData, Document
26
- from langroid.utils.configuration import settings
27
- from langroid.vector_store.base import VectorStore, VectorStoreConfig
28
-
29
- logger = logging.getLogger(__name__)
30
-
31
-
32
class MeiliSearchConfig(VectorStoreConfig):
    """Configuration for the MeiliSearch-backed document store."""

    # Cloud-mode flag; `MeiliSearch.__init__` may reset it to False.
    cloud: bool = False
    # Name of the index ("collection" in Langroid terms) to operate on.
    collection_name: str | None = None
    # Primary-key attribute used when creating indexes and adding documents.
    primary_key: str = "id"
    # Port used to build the default local URL; annotated explicitly so the
    # override is a declared, typed field rather than an inferred one.
    port: int = 7700
37
-
38
-
39
- class MeiliSearch(VectorStore):
40
def __init__(self, config: MeiliSearchConfig = MeiliSearchConfig()):
    """
    Set up the MeiliSearch client factory and, if a collection name is
    configured, create that collection.

    The API key and URL are read from the `MEILISEARCH_API_KEY` and
    `MEILISEARCH_API_URL` environment variables, falling back to
    "masterKey" and `http://{host}:{port}`.

    Args:
        config (MeiliSearchConfig): store settings. If `config.cloud` is
            True but either environment variable is missing, a warning is
            logged and `config.cloud` is reset to False.

    Raises:
        LangroidImportError: if `meilisearch_python_sdk` is not installed.
    """
    super().__init__(config)
    try:
        import meilisearch_python_sdk as meilisearch
    except ImportError:
        raise LangroidImportError("meilisearch", "meilisearch")

    self.config: MeiliSearchConfig = config
    self.host = config.host
    self.port = config.port
    load_dotenv()
    # Read the raw environment values first: once the defaults are applied
    # neither value can be None, so the cloud check must use the raw ones.
    env_key = os.getenv("MEILISEARCH_API_KEY")
    env_url = os.getenv("MEILISEARCH_API_URL")
    self.key = env_key or "masterKey"
    self.url = env_url or f"http://{self.host}:{self.port}"
    if config.cloud and not (env_key and env_url):
        logger.warning(
            f"""MEILISEARCH_API_KEY, MEILISEARCH_API_URL env variable must be set
            to use MeiliSearch in cloud mode. Please set these values
            in your .env file. Switching to local MeiliSearch at
            {self.url}
            """
        )
        config.cloud = False

    # A factory rather than a client instance: each operation opens its own
    # AsyncClient via `async with self.client() as client`.
    self.client: Callable[[], meilisearch.AsyncClient] = lambda: (
        meilisearch.AsyncClient(url=self.url, api_key=self.key)
    )

    # Note: Only create collection if a non-null collection name is provided.
    # This is useful to delay creation of db until we have a suitable
    # collection name (e.g. we could get it from the url or folder path).
    if config.collection_name is not None:
        self.create_collection(
            config.collection_name, replace=config.replace_collection
        )
74
-
75
def clear_empty_collections(self) -> int:
    """
    No-op: every MeiliSearch collection is treated as non-empty.

    Returns:
        int: always 0, since nothing is deleted.
    """
    n_deleted = 0
    return n_deleted
79
-
80
async def _async_delete_indices(self, uids: List[str]) -> List[bool]:
    """
    Delete any indices in `uids` that exist.

    Args:
        uids (List[str]): uids of the indices to delete.

    Returns:
        List[bool]: one flag per uid, in order, indicating whether that
        index has been deleted.
    """
    async with self.client() as client:
        # Issue all deletions concurrently.
        pending = [client.delete_index_if_exists(uid=uid) for uid in uids]
        outcome = await asyncio.gather(*pending)
    return outcome
88
-
89
def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
    """
    Delete all indices whose names start with `prefix`.

    Args:
        really (bool): safety switch; nothing is deleted unless True.
        prefix (str): only indices whose names start with this are deleted.

    Returns:
        int: number of indices reported as deleted.
    """
    if not really:
        logger.warning("Not deleting all collections, set really=True to confirm")
        return 0
    doomed = []
    for name in self.list_collections():
        if name.startswith(prefix):
            doomed.append(name)
    deleted_flags = asyncio.run(self._async_delete_indices(doomed))
    n_deletes = sum(deleted_flags)
    logger.warning(f"Deleted {n_deletes} indices in MeiliSearch")
    return n_deletes
99
-
100
- def _list_all_collections(self) -> List[str]:
101
- """
102
- List all collections, including empty ones.
103
- Returns:
104
- List of collection names.
105
- """
106
- return self.list_collections()
107
-
108
async def _async_get_indexes(self) -> List[AsyncIndex]:
    """
    Fetch up to 10,000 indexes from the server.

    Returns:
        List[AsyncIndex]: the indexes, or an empty list when the client
        returns None.
    """
    async with self.client() as client:
        found = await client.get_indexes(limit=10_000)
    if found is None:
        return []
    return found  # type: ignore
112
-
113
async def _async_get_index(self, index_uid: str) -> "AsyncIndex":
    """
    Fetch a single index by uid.

    Args:
        index_uid (str): uid of the index to fetch.

    Returns:
        AsyncIndex: whatever the client's `get_index` returns.
    """
    async with self.client() as client:
        fetched = await client.get_index(index_uid)
    return fetched  # type: ignore
117
-
118
def list_collections(self, empty: bool = False) -> List[str]:
    """
    List the names (uids) of the stored indexes.

    Any existing index is treated as non-empty.

    Args:
        empty (bool): not used by this implementation.

    Returns:
        List[str]: the uid of every index; empty when there are none.
    """
    indexes = asyncio.run(self._async_get_indexes())
    uids = []
    for index in indexes:
        uids.append(index.uid)
    return uids
128
-
129
async def _async_create_index(self, collection_name: str) -> "AsyncIndex":
    """
    Create an index using the configured primary key.

    Args:
        collection_name (str): uid for the new index.

    Returns:
        AsyncIndex: the value returned by the client's `create_index`.
    """
    primary_key = self.config.primary_key
    async with self.client() as client:
        created = await client.create_index(
            uid=collection_name,
            primary_key=primary_key,
        )
    return created
136
-
137
async def _async_delete_index(self, collection_name: str) -> bool:
    """
    Delete the index if it exists.

    Args:
        collection_name (str): uid of the index to delete.

    Returns:
        bool: True iff the index was deleted.
    """
    async with self.client() as client:
        was_deleted = await client.delete_index_if_exists(uid=collection_name)
    return was_deleted  # type: ignore
142
-
143
def create_collection(self, collection_name: str, replace: bool = False) -> None:
    """
    Make `collection_name` the active collection and create its index.

    If the index already exists it is kept as-is unless `replace` is True,
    in which case it is deleted and created afresh.

    Args:
        collection_name (str): Name of the collection to create.
        replace (bool): Whether to replace an existing collection
            with the same name. Defaults to False.
    """
    self.config.collection_name = collection_name
    already_exists = collection_name in self.list_collections()
    if already_exists:
        logger.warning(
            f"MeiliSearch Non-empty Index {collection_name} already exists"
        )
        if not replace:
            logger.warning("Not replacing collection")
            return
        logger.warning("Recreating fresh collection")
        asyncio.run(self._async_delete_index(collection_name))
    asyncio.run(self._async_create_index(collection_name))
    collection_info = asyncio.run(self._async_get_index(collection_name))
    if settings.debug:
        # Temporarily lower the threshold so the info line is emitted, then
        # restore whatever level was in effect.
        saved_level = logger.getEffectiveLevel()
        logger.setLevel(logging.INFO)
        logger.info(collection_info)
        logger.setLevel(saved_level)
171
-
172
async def _async_add_documents(
    self, collection_name: str, documents: Sequence[Dict[str, Any]]
) -> None:
    """
    Add `documents` to an index in batches.

    Args:
        collection_name (str): uid of the target index.
        documents (Sequence[Dict[str, Any]]): the records to add; batch
            size and primary key are taken from `self.config`.
    """
    cfg = self.config
    async with self.client() as client:
        target = client.index(collection_name)
        await target.add_documents_in_batches(
            documents=documents,
            batch_size=cfg.batch_size,
            primary_key=cfg.primary_key,
        )
182
-
183
def add_documents(self, documents: Sequence[Document]) -> None:
    """
    Store `documents` in the configured collection.

    The collection is created first if it is not already listed. Each
    document is stored as a record with `id`, `content` and `metadata` keys.

    Args:
        documents (Sequence[Document]): documents to add; an empty sequence
            is a no-op.

    Raises:
        ValueError: if no collection name is configured.
    """
    super().maybe_add_ids(documents)
    if len(documents) == 0:
        return
    existing = self._list_all_collections()
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot ingest docs")
    if self.config.collection_name not in existing:
        self.create_collection(self.config.collection_name, replace=True)
    payload = []
    for d in documents:
        payload.append(
            dict(
                id=d.id(),
                content=d.content,
                metadata=d.metadata.dict(),
            )
        )
    asyncio.run(self._async_add_documents(self.config.collection_name, payload))
201
-
202
def delete_collection(self, collection_name: str) -> None:
    """
    Delete the index named `collection_name`, if it exists.

    Args:
        collection_name (str): uid of the index to delete.
    """
    deletion = self._async_delete_index(collection_name)
    asyncio.run(deletion)
204
-
205
- def _to_int_or_uuid(self, id: str) -> int | str:
206
- try:
207
- return int(id)
208
- except ValueError:
209
- return id
210
-
211
async def _async_get_documents(self, where: str = "") -> "DocumentsInfo":
    """
    Fetch up to 10,000 documents from the configured index.

    Args:
        where (str): filter passed to the index; None is replaced by an
            empty list.

    Returns:
        DocumentsInfo: the value returned by the index's `get_documents`.

    Raises:
        ValueError: if no collection name is configured.
    """
    coll_name = self.config.collection_name
    if coll_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    doc_filter = where if where is not None else []
    async with self.client() as client:
        target = client.index(coll_name)
        fetched = await target.get_documents(limit=10_000, filter=doc_filter)
    return fetched
219
-
220
def get_all_documents(self, where: str = "") -> List[Document]:
    """
    Return the documents stored in the configured collection.

    Args:
        where (str): filter forwarded to `_async_get_documents`.

    Returns:
        List[Document]: documents rebuilt from each record's `content` and
        `metadata`; empty when the fetch yields None.

    Raises:
        ValueError: if no collection name is configured.
    """
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    fetched = asyncio.run(self._async_get_documents(where))
    if fetched is None:
        return []
    documents = []
    for record in fetched.results:
        documents.append(
            Document(
                content=record["content"],
                metadata=DocMetaData(**record["metadata"]),
            )
        )
    return documents
234
-
235
async def _async_get_documents_by_ids(self, ids: List[str]) -> List[Dict[str, Any]]:
    """
    Fetch the records for `ids` concurrently from the configured index.

    Args:
        ids (List[str]): ids of the documents to fetch.

    Returns:
        List[Dict[str, Any]]: one record per id, in the order given.

    Raises:
        ValueError: if no collection name is configured.
    """
    coll_name = self.config.collection_name
    if coll_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    async with self.client() as client:
        target = client.index(coll_name)
        lookups = [target.get_document(doc_id) for doc_id in ids]
        records = await asyncio.gather(*lookups)
    return records
242
-
243
def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
    """
    Return the documents with the given ids.

    Args:
        ids (List[str]): ids of the documents to fetch.

    Returns:
        List[Document]: documents rebuilt from each fetched record's
        `content` and `metadata`.

    Raises:
        ValueError: if no collection name is configured.
    """
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot retrieve docs")
    records = asyncio.run(self._async_get_documents_by_ids(ids))
    documents = []
    for record in records:
        documents.append(
            Document(
                content=record["content"],
                metadata=DocMetaData(**record["metadata"]),
            )
        )
    return documents
254
-
255
async def _async_search(
    self,
    query: str,
    k: int = 20,
    filter: str | list[str | list[str]] | None = None,
) -> List[Dict[str, Any]]:
    """
    Run a search against the configured index.

    Args:
        query (str): the search query.
        k (int): maximum number of hits to request.
        filter: filter forwarded unchanged to the index's `search`.

    Returns:
        List[Dict[str, Any]]: the `hits` of the search result; ranking
        scores are requested via `show_ranking_score=True`.

    Raises:
        ValueError: if no collection name is configured.
    """
    coll_name = self.config.collection_name
    if coll_name is None:
        raise ValueError("No collection name set, cannot search")
    async with self.client() as client:
        target = client.index(coll_name)
        outcome = await target.search(
            query,
            limit=k,
            show_ranking_score=True,
            filter=filter,
        )
    return outcome.hits  # type: ignore
272
-
273
def similar_texts_with_scores(
    self,
    text: str,
    k: int = 20,
    where: Optional[str] = None,
    neighbors: int = 0,  # ignored
) -> List[Tuple[Document, float]]:
    """
    Search the configured collection for `text` and return scored matches.

    Args:
        text (str): the search query.
        k (int): maximum number of hits to request.
        where (Optional[str]): filter for the search; None is replaced by
            an empty list.
        neighbors (int): ignored.

    Returns:
        List[Tuple[Document, float]]: (document, `_rankingScore`) pairs,
        one per hit; empty when nothing matches.

    Raises:
        ValueError: if no collection name is configured.
    """
    search_filter = where if where is not None else []
    if self.config.collection_name is None:
        raise ValueError("No collection name set, cannot search")
    hits = asyncio.run(self._async_search(text, k, search_filter))  # type: ignore
    if len(hits) == 0:
        logger.warning(f"No matches found for {text}")
        return []
    scores = [hit["_rankingScore"] for hit in hits]
    if settings.debug:
        logger.info(f"Found {len(hits)} matches, max score: {max(scores)}")
    docs = []
    for hit in hits:
        docs.append(
            Document(
                content=hit["content"],
                metadata=DocMetaData(**hit["metadata"]),
            )
        )
    doc_score_pairs = list(zip(docs, scores))
    self.show_if_debug(doc_score_pairs)
    return doc_score_pairs