langchain-weaviate 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 LangChain, Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.1
2
+ Name: langchain-weaviate
3
+ Version: 0.0.1
4
+ Summary: An integration package connecting Weaviate and LangChain
5
+ Requires-Python: >=3.9,<4.0
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.9
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Dist: langchain-core (>=0.1.33,<0.2.0)
12
+ Requires-Dist: numpy (>=1.26.2,<2.0.0)
13
+ Requires-Dist: simsimd (>=3.6.1,<5.0.0)
14
+ Requires-Dist: weaviate-client (>=4.0.0,<5.0.0)
15
+ Description-Content-Type: text/markdown
16
+
17
+ # langchain-weaviate
18
+
19
+ ## About
20
+
21
+ This package contains the [Weaviate](https://github.com/weaviate/weaviate) integrations for [LangChain](https://github.com/langchain-ai/langchain).
22
+
23
+ - **Weaviate** is an open source, AI-native vector database that helps developers create intuitive and reliable AI-powered applications.
24
+ - **LangChain** is a framework for developing applications powered by language models.
25
+
26
+ Using this package, LangChain users can conveniently set Weaviate as their vector store to store and retrieve embeddings.
27
+
28
+ ## Requirements
29
+
30
+ To use this package, you need to have a running Weaviate instance.
31
+
32
+ Weaviate can be [deployed in many different ways](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate): in containerized environments, on Kubernetes, on-premises, or in the cloud, either as a managed service or through a cloud provider such as AWS or Google Cloud.
33
+
34
+ The deployment method to choose depends on your use case and infrastructure requirements.
35
+
36
+ Two of the most common ways to deploy Weaviate are:
37
+ - [Docker Compose](https://weaviate.io/developers/weaviate/installation/docker-compose)
38
+ - [Weaviate Cloud Services (WCS)](https://console.weaviate.cloud)
39
+
40
+ ## Installation and Setup
41
+
42
+ As an integration package, this assumes you have already installed LangChain. If not, please refer to the [LangChain installation guide](https://python.langchain.com/docs/get_started/installation).
43
+
44
+ Then, install this package:
45
+
46
+ ```bash
47
+ pip install langchain-weaviate
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ Please see the included [Jupyter notebook](docs/vectorstores.ipynb) for an example of how to use this package.
53
+
54
+ ## Further resources
55
+
56
+ - [LangChain documentation](https://python.langchain.com/docs)
57
+ - [Weaviate documentation](https://weaviate.io/developers/weaviate)
58
+
@@ -0,0 +1,41 @@
1
+ # langchain-weaviate
2
+
3
+ ## About
4
+
5
+ This package contains the [Weaviate](https://github.com/weaviate/weaviate) integrations for [LangChain](https://github.com/langchain-ai/langchain).
6
+
7
+ - **Weaviate** is an open source, AI-native vector database that helps developers create intuitive and reliable AI-powered applications.
8
+ - **LangChain** is a framework for developing applications powered by language models.
9
+
10
+ Using this package, LangChain users can conveniently set Weaviate as their vector store to store and retrieve embeddings.
11
+
12
+ ## Requirements
13
+
14
+ To use this package, you need to have a running Weaviate instance.
15
+
16
+ Weaviate can be [deployed in many different ways](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate): in containerized environments, on Kubernetes, on-premises, or in the cloud, either as a managed service or through a cloud provider such as AWS or Google Cloud.
17
+
18
+ The deployment method to choose depends on your use case and infrastructure requirements.
19
+
20
+ Two of the most common ways to deploy Weaviate are:
21
+ - [Docker Compose](https://weaviate.io/developers/weaviate/installation/docker-compose)
22
+ - [Weaviate Cloud Services (WCS)](https://console.weaviate.cloud)
23
+
24
+ ## Installation and Setup
25
+
26
+ As an integration package, this assumes you have already installed LangChain. If not, please refer to the [LangChain installation guide](https://python.langchain.com/docs/get_started/installation).
27
+
28
+ Then, install this package:
29
+
30
+ ```bash
31
+ pip install langchain-weaviate
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ Please see the included [Jupyter notebook](docs/vectorstores.ipynb) for an example of how to use this package.
37
+
38
+ ## Further resources
39
+
40
+ - [LangChain documentation](https://python.langchain.com/docs)
41
+ - [Weaviate documentation](https://weaviate.io/developers/weaviate)
@@ -0,0 +1,5 @@
1
+ from langchain_weaviate.vectorstores import WeaviateVectorStore
2
+
3
+ __all__ = [
4
+ "WeaviateVectorStore",
5
+ ]
@@ -0,0 +1,63 @@
1
+ """Math utils."""
2
+
3
+ import logging
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import simsimd # type: ignore
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
12
+
13
+
14
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute pairwise cosine similarity between the rows of two matrices.

    Args:
        X: First matrix; each row is one vector.
        Y: Second matrix; must have the same number of columns as ``X``.

    Returns:
        ``1 - cosine_distance`` for every (row of X, row of Y) pair as computed
        by ``simsimd.cdist``, or an empty array when either input is empty.

    Raises:
        ValueError: If ``X`` and ``Y`` have a different number of columns.
    """
    if not len(X) or not len(Y):
        return np.array([])

    left = np.array(X)
    right = np.array(Y)
    if left.shape[1] != right.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {left.shape} "
            f"and Y has shape {right.shape}."
        )

    # The distance computation is fed float32 copies of both matrices.
    left32 = np.array(left, dtype=np.float32)
    right32 = np.array(right, dtype=np.float32)
    distances = np.array(simsimd.cdist(left32, right32, metric="cosine"))
    similarity = 1 - distances

    # A scalar result is wrapped so callers always receive an array.
    if isinstance(similarity, float):
        return np.array([similarity])
    return similarity
33
+
34
+
35
def cosine_similarity_top_k(
    X: Matrix,
    Y: Matrix,
    top_k: Optional[int] = 5,
    score_threshold: Optional[float] = None,
) -> Tuple[List[Tuple[int, int]], List[float]]:
    """Row-wise cosine similarity with optional top-k and score threshold filtering.

    Args:
        X: Matrix.
        Y: Matrix, same width as X.
        top_k: Max number of results to return. A falsy value (``None`` or 0)
            means "no limit".
        score_threshold: Minimum cosine similarity of results. ``None`` means
            no threshold.

    Returns:
        Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx),
        second contains corresponding cosine similarities, best match first.
    """
    if len(X) == 0 or len(Y) == 0:
        return [], []
    score_array = cosine_similarity(X, Y)

    # Compare against None explicitly: `score_threshold or -1.0` silently
    # discarded a legitimate threshold of 0.0.
    if score_threshold is None:
        score_threshold = -1.0
    score_array[score_array < score_threshold] = 0

    # With no explicit limit every pair is a candidate, so the fallback is the
    # total number of scores rather than the number of rows.
    top_k = min(top_k or score_array.size, np.count_nonzero(score_array))
    if top_k == 0:
        # Nothing survived filtering. Slicing with [-0:] below would otherwise
        # select *every* element instead of none.
        return [], []

    top_k_idxs = np.argpartition(score_array, -top_k, axis=None)[-top_k:]
    # argpartition only guarantees membership, so order the winners descending.
    top_k_idxs = top_k_idxs[np.argsort(score_array.ravel()[top_k_idxs])][::-1]
    ret_idxs = np.unravel_index(top_k_idxs, score_array.shape)
    scores = score_array.ravel()[top_k_idxs].tolist()
    return list(zip(*ret_idxs)), scores  # type: ignore
File without changes
@@ -0,0 +1,53 @@
1
+ """Utility functions for working with vectors and vectorstores."""
2
+
3
+ from enum import Enum
4
+ from typing import List
5
+
6
+ import numpy as np
7
+
8
+ from langchain_weaviate._math import cosine_similarity
9
+
10
+
11
class DistanceStrategy(str, Enum):
    """Names of the strategies available for calculating distances between vectors.

    Members are ``str`` instances whose value is identical to the member name.
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
    DOT_PRODUCT = "DOT_PRODUCT"
    JACCARD = "JACCARD"
    COSINE = "COSINE"
20
+
21
+
22
def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Select indices of embeddings by maximal marginal relevance (MMR).

    The first pick is the embedding most similar to the query. Each further
    pick maximises ``lambda_mult * similarity_to_query -
    (1 - lambda_mult) * max_similarity_to_already_selected``.

    Args:
        query_embedding: Query vector; a 1-D vector is promoted to a single row.
        embedding_list: Candidate embeddings.
        lambda_mult: Weight of query similarity versus redundancy.
        k: Maximum number of indices to return.

    Returns:
        Indices into ``embedding_list`` in selection order; empty when ``k`` or
        the candidate count is not positive.
    """
    wanted = min(k, len(embedding_list))
    if wanted <= 0:
        return []

    if query_embedding.ndim == 1:
        query_embedding = query_embedding[np.newaxis, :]

    query_scores = cosine_similarity(query_embedding, embedding_list)[0]
    first_pick = int(np.argmax(query_scores))
    chosen = [first_pick]
    chosen_vectors = np.array([embedding_list[first_pick]])

    while len(chosen) < wanted:
        # Similarity of every candidate to each embedding picked so far.
        overlap = cosine_similarity(embedding_list, chosen_vectors)
        winner, winner_score = -1, -np.inf
        for candidate, relevance in enumerate(query_scores):
            if candidate in chosen:
                continue
            redundancy = max(overlap[candidate])
            mmr_score = lambda_mult * relevance - (1 - lambda_mult) * redundancy
            if mmr_score > winner_score:
                winner, winner_score = candidate, mmr_score
        chosen.append(winner)
        chosen_vectors = np.append(chosen_vectors, [embedding_list[winner]], axis=0)

    return chosen
@@ -0,0 +1,542 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import logging
5
+ from collections.abc import Generator
6
+ from contextlib import contextmanager
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ Callable,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ Literal,
15
+ Optional,
16
+ Tuple,
17
+ Union,
18
+ overload,
19
+ )
20
+ from uuid import uuid4
21
+
22
+ import numpy as np
23
+ import weaviate # type: ignore
24
+ from langchain_core.documents import Document
25
+ from langchain_core.embeddings import Embeddings
26
+ from langchain_core.vectorstores import VectorStore
27
+
28
+ from langchain_weaviate.utils import maximal_marginal_relevance
29
+
30
+ if TYPE_CHECKING:
31
+ import weaviate
32
+
33
+
34
logger = logging.getLogger(__name__)
# NOTE(review): the statements below run at import time and configure this
# module's logger directly (DEBUG level plus a dedicated stream handler), so
# merely importing the package changes the host application's log output.
# Libraries conventionally leave level and handler configuration to the
# application — confirm this is intentional.
logger.setLevel(logging.DEBUG)

# Stream handler with a "time - logger name - level - message" layout.
handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%b-%d %I:%M %p"
)
handler.setFormatter(formatter)

logger.addHandler(handler)
44
+
45
+
46
+ def _default_schema(index_name: str) -> Dict:
47
+ return {
48
+ "class": index_name,
49
+ "properties": [
50
+ {
51
+ "name": "text",
52
+ "dataType": ["text"],
53
+ }
54
+ ],
55
+ }
56
+
57
+
58
+ def _default_score_normalizer(val: float) -> float:
59
+ # prevent overflow
60
+ # use 709 because that's the largest exponent that doesn't overflow
61
+ # use -709 because that's the smallest exponent that doesn't underflow
62
+ val = np.clip(val, -709, 709)
63
+ return 1 - 1 / (1 + np.exp(val))
64
+
65
+
66
+ def _json_serializable(value: Any) -> Any:
67
+ if isinstance(value, datetime.datetime):
68
+ return value.isoformat()
69
+ return value
70
+
71
+
72
class WeaviateVectorStore(VectorStore):
    """`Weaviate` vector store.

    To use, you should have the ``weaviate-client`` python package installed.

    Example:
        .. code-block:: python

            import weaviate
            from langchain_weaviate import WeaviateVectorStore

            # Must be a ``weaviate.WeaviateClient`` instance; the constructor
            # raises ``ValueError`` for any other client type.
            client = ...
            vectorstore = WeaviateVectorStore(client, index_name, text_key)

    """
87
+
88
+ def __init__(
89
+ self,
90
+ client: weaviate.WeaviateClient,
91
+ index_name: Optional[str],
92
+ text_key: str,
93
+ embedding: Optional[Embeddings] = None,
94
+ attributes: Optional[List[str]] = None,
95
+ relevance_score_fn: Optional[
96
+ Callable[[float], float]
97
+ ] = _default_score_normalizer,
98
+ use_multi_tenancy: bool = False,
99
+ ):
100
+ """Initialize with Weaviate client."""
101
+
102
+ if not isinstance(client, weaviate.WeaviateClient):
103
+ raise ValueError(
104
+ "client should be an instance of"
105
+ f" weaviate.WeaviateClient, got {type(client)}"
106
+ )
107
+ self._client = client
108
+ self._index_name = index_name or f"LangChain_{uuid4().hex}"
109
+ self._embedding = embedding
110
+ self._text_key = text_key
111
+ self._query_attrs = [self._text_key]
112
+ self.relevance_score_fn = relevance_score_fn
113
+ if attributes is not None:
114
+ self._query_attrs.extend(attributes)
115
+
116
+ schema = _default_schema(self._index_name)
117
+ schema["MultiTenancyConfig"] = {"enabled": use_multi_tenancy}
118
+
119
+ # check whether the index already exists
120
+ if not client.collections.exists(self._index_name):
121
+ client.collections.create_from_dict(schema)
122
+
123
+ # store collection for convenience
124
+ # this does not actually send a request to weaviate
125
+ self._collection = client.collections.get(self._index_name)
126
+
127
+ # store this setting so we don't have to send a request to weaviate
128
+ # every time we want to do a CRUD operation
129
+ self._multi_tenancy_enabled = self._collection.config.get(
130
+ simple=False
131
+ ).multi_tenancy_config.enabled
132
+
133
+ @property
134
+ def embeddings(self) -> Optional[Embeddings]:
135
+ return self._embedding
136
+
137
+ def _select_relevance_score_fn(self) -> Callable[[float], float]:
138
+ return (
139
+ self.relevance_score_fn
140
+ if self.relevance_score_fn
141
+ else _default_score_normalizer
142
+ )
143
+
144
+ def add_texts(
145
+ self,
146
+ texts: Iterable[str],
147
+ metadatas: Optional[List[dict]] = None,
148
+ tenant: Optional[str] = None,
149
+ **kwargs: Any,
150
+ ) -> List[str]:
151
+ """Upload texts with metadata (properties) to Weaviate."""
152
+ from weaviate.util import get_valid_uuid # type: ignore
153
+
154
+ if tenant and not self._does_tenant_exist(tenant):
155
+ logger.info(
156
+ f"Tenant {tenant} does not exist in index {self._index_name}. "
157
+ "Creating tenant."
158
+ )
159
+ tenant_objs = [weaviate.classes.tenants.Tenant(name=tenant)]
160
+ self._collection.tenants.create(tenants=tenant_objs)
161
+
162
+ ids = []
163
+ embeddings: Optional[List[List[float]]] = None
164
+ if self._embedding:
165
+ embeddings = self._embedding.embed_documents(list(texts))
166
+
167
+ with self._client.batch.dynamic() as batch:
168
+ for i, text in enumerate(texts):
169
+ data_properties = {self._text_key: text}
170
+ if metadatas is not None:
171
+ for key, val in metadatas[i].items():
172
+ data_properties[key] = _json_serializable(val)
173
+
174
+ # Allow for ids (consistent w/ other methods)
175
+ # # Or uuids (backwards compatible w/ existing arg)
176
+ # If the UUID of one of the objects already exists
177
+ # then the existing object will be replaced by the new object.
178
+ _id = get_valid_uuid(uuid4())
179
+ if "uuids" in kwargs:
180
+ _id = kwargs["uuids"][i]
181
+ elif "ids" in kwargs:
182
+ _id = kwargs["ids"][i]
183
+
184
+ batch.add_object(
185
+ collection=self._index_name,
186
+ properties=data_properties,
187
+ uuid=_id,
188
+ vector=embeddings[i] if embeddings else None,
189
+ tenant=tenant,
190
+ )
191
+
192
+ ids.append(_id)
193
+
194
+ failed_objs = self._client.batch.failed_objects
195
+ for obj in failed_objs:
196
+ err_message = (
197
+ f"Failed to add object: {obj.original_uuid}\nReason: {obj.message}"
198
+ )
199
+
200
+ logger.error(err_message)
201
+
202
+ return ids
203
+
204
+ @overload
205
+ def _perform_search(
206
+ self,
207
+ query: Optional[str],
208
+ k: int,
209
+ return_score: Literal[False] = False,
210
+ tenant: Optional[str] = None,
211
+ **kwargs: Any,
212
+ ) -> List[Document]: ...
213
+ @overload
214
+ def _perform_search(
215
+ self,
216
+ query: Optional[str],
217
+ k: int,
218
+ return_score: Literal[True],
219
+ tenant: Optional[str] = None,
220
+ **kwargs: Any,
221
+ ) -> List[Tuple[Document, float]]: ...
222
+ def _perform_search(
223
+ self,
224
+ query: Optional[str],
225
+ k: int,
226
+ return_score: bool = False,
227
+ tenant: Optional[str] = None,
228
+ **kwargs: Any,
229
+ ) -> Union[List[Document], List[Tuple[Document, float]]]:
230
+ """
231
+ Perform a similarity search.
232
+
233
+ Parameters:
234
+ query (str): The query string to search for.
235
+ k (int): The number of results to return.
236
+ return_score (bool, optional): Whether to return the score along with the
237
+ document. Defaults to False.
238
+ tenant (Optional[str], optional): The tenant name. Defaults to None.
239
+ **kwargs: Additional parameters to pass to the search method. These parameters
240
+ will be directly passed to the underlying Weaviate client's search method.
241
+
242
+ Returns:
243
+ List[Union[Document, Tuple[Document, float]]]: A list of documents that match
244
+ the query. If return_score is True, each document is returned as a tuple
245
+ with the document and its score.
246
+
247
+ Raises:
248
+ ValueError: If _embedding is None or an invalid search method is provided.
249
+ """
250
+ if self._embedding is None:
251
+ raise ValueError("_embedding cannot be None for similarity_search")
252
+
253
+ if "return_metadata" not in kwargs:
254
+ kwargs["return_metadata"] = ["score"]
255
+ elif "score" not in kwargs["return_metadata"]:
256
+ kwargs["return_metadata"].append("score")
257
+
258
+ if (
259
+ "return_properties" in kwargs
260
+ and self._text_key not in kwargs["return_properties"]
261
+ ):
262
+ kwargs["return_properties"].append(self._text_key)
263
+
264
+ vector = kwargs.pop("vector", None)
265
+
266
+ # workaround to handle test_max_marginal_relevance_search
267
+ if vector is None:
268
+ if query is None:
269
+ # raise an error because weaviate will do a fetch object query
270
+ # if both query and vector are None
271
+ raise ValueError("Either query or vector must be provided.")
272
+ else:
273
+ vector = self._embedding.embed_query(query)
274
+
275
+ return_uuids = kwargs.pop("return_uuids", False)
276
+
277
+ with self._tenant_context(tenant) as collection:
278
+ try:
279
+ result = collection.query.hybrid(
280
+ query=query, vector=vector, limit=k, **kwargs
281
+ )
282
+ except weaviate.exceptions.WeaviateQueryException as e:
283
+ raise ValueError(f"Error during query: {e}")
284
+
285
+ docs_and_scores: List[Tuple[Document, float]] = []
286
+ for obj in result.objects:
287
+ text = obj.properties.pop(self._text_key)
288
+ filtered_metadata = {
289
+ k: v
290
+ for k, v in obj.metadata.__dict__.items()
291
+ if v is not None and k != "score"
292
+ }
293
+ merged_props = {
294
+ **obj.properties,
295
+ **filtered_metadata,
296
+ **({"vector": obj.vector["default"]} if obj.vector else {}),
297
+ **({"uuid": str(obj.uuid)} if return_uuids else {}),
298
+ }
299
+ doc = Document(page_content=text, metadata=merged_props)
300
+ score = obj.metadata.score
301
+ docs_and_scores.append((doc, score))
302
+
303
+ if return_score:
304
+ return docs_and_scores
305
+ else:
306
+ return [doc for doc, _ in docs_and_scores]
307
+
308
+ def similarity_search(
309
+ self, query: str, k: int = 4, **kwargs: Any
310
+ ) -> List[Document]:
311
+ """Return docs most similar to query.
312
+
313
+ Args:
314
+ query: Text to look up documents similar to.
315
+ k: Number of Documents to return. Defaults to 4.
316
+ **kwargs: Additional keyword arguments will be passed to the `hybrid()`
317
+ function of the weaviate client.
318
+
319
+ Returns:
320
+ List of Documents most similar to the query.
321
+ """
322
+
323
+ result = self._perform_search(query, k, **kwargs)
324
+ return result
325
+
326
+ def max_marginal_relevance_search(
327
+ self,
328
+ query: str,
329
+ k: int = 4,
330
+ fetch_k: int = 20,
331
+ lambda_mult: float = 0.5,
332
+ **kwargs: Any,
333
+ ) -> List[Document]:
334
+ """Return docs selected using the maximal marginal relevance.
335
+
336
+ Maximal marginal relevance optimizes for similarity to query AND diversity
337
+ among selected documents.
338
+
339
+ Args:
340
+ query: Text to look up documents similar to.
341
+ k: Number of Documents to return. Defaults to 4.
342
+ fetch_k: Number of Documents to fetch to pass to MMR algorithm.
343
+ lambda_mult: Number between 0 and 1 that determines the degree
344
+ of diversity among the results with 0 corresponding
345
+ to maximum diversity and 1 to minimum diversity.
346
+ Defaults to 0.5.
347
+
348
+ Returns:
349
+ List of Documents selected by maximal marginal relevance.
350
+ """
351
+ if self._embedding is not None:
352
+ embedding = self._embedding.embed_query(query)
353
+ else:
354
+ raise ValueError(
355
+ "max_marginal_relevance_search requires a suitable Embeddings object"
356
+ )
357
+
358
+ return self.max_marginal_relevance_search_by_vector(
359
+ embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, **kwargs
360
+ )
361
+
362
+ def max_marginal_relevance_search_by_vector(
363
+ self,
364
+ embedding: List[float],
365
+ k: int = 4,
366
+ fetch_k: int = 20,
367
+ lambda_mult: float = 0.5,
368
+ **kwargs: Any,
369
+ ) -> List[Document]:
370
+ """Return docs selected using the maximal marginal relevance.
371
+
372
+ Maximal marginal relevance optimizes for similarity to query AND diversity
373
+ among selected documents.
374
+
375
+ Args:
376
+ embedding: Embedding to look up documents similar to.
377
+ k: Number of Documents to return. Defaults to 4.
378
+ fetch_k: Number of Documents to fetch to pass to MMR algorithm.
379
+ lambda_mult: Number between 0 and 1 that determines the degree
380
+ of diversity among the results with 0 corresponding
381
+ to maximum diversity and 1 to minimum diversity.
382
+ Defaults to 0.5.
383
+
384
+ Returns:
385
+ List of Documents selected by maximal marginal relevance.
386
+ """
387
+
388
+ results = self._perform_search(
389
+ query=None,
390
+ k=fetch_k,
391
+ include_vector=True,
392
+ vector=embedding,
393
+ **kwargs,
394
+ )
395
+
396
+ embeddings = [result.metadata["vector"] for result in results]
397
+ mmr_selected = maximal_marginal_relevance(
398
+ np.array(embedding), embeddings, k=k, lambda_mult=lambda_mult
399
+ )
400
+
401
+ docs = []
402
+
403
+ for idx in mmr_selected:
404
+ text = results[idx].page_content
405
+ results[idx].metadata.pop("vector")
406
+ docs.append(Document(page_content=text, metadata=results[idx].metadata))
407
+
408
+ return docs
409
+
410
    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """
        Return the documents most similar to the query text, each paired with
        a score.

        The search is delegated to ``_perform_search`` with
        ``return_score=True``; each score is the ``score`` field from the
        metadata of the corresponding hybrid-query result.

        NOTE(review): this docstring previously described the score as a
        cosine distance where lower means more similar. The value comes from a
        hybrid query, where a higher score presumably means a better match —
        confirm against the Weaviate documentation.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Forwarded to ``_perform_search``.

        Returns:
            List of ``(Document, score)`` tuples.
        """

        results = self._perform_search(query, k, return_score=True, **kwargs)

        return results
422
+
423
    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings],
        metadatas: Optional[List[dict]] = None,
        *,
        tenant: Optional[str] = None,
        client: weaviate.WeaviateClient = None,
        index_name: Optional[str] = None,
        text_key: str = "text",
        relevance_score_fn: Optional[
            Callable[[float], float]
        ] = _default_score_normalizer,
        **kwargs: Any,
    ) -> WeaviateVectorStore:
        """Construct Weaviate wrapper from raw documents.

        This is a user-friendly interface that:
            1. Builds the vector store, which creates the collection in the
                Weaviate instance when it does not already exist.
            2. Embeds the documents (when an embedding model is given).
            3. Adds the documents to that collection.

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to add to vector store.
            embedding: Text embedding model to use.
            client: ``weaviate.WeaviateClient`` to use. Although it defaults to
                None, it is effectively required: the constructor raises
                ``ValueError`` for anything that is not a ``WeaviateClient``.
            metadatas: Metadata associated with each text. The keys of the
                first entry are used as the store's extra query attributes.
            tenant: The tenant name. Defaults to None. Passing a tenant enables
                multi-tenancy on the store.
            index_name: Index name.
            text_key: Key to use for uploading/retrieving text to/from vectorstore.
            relevance_score_fn: Function for converting whatever distance function the
                vector store uses to a relevance score, which is a normalized similarity
                score (0 means dissimilar, 1 means similar).
            **kwargs: Additional named parameters to pass to ``add_texts()``
                (for example ``ids`` or ``uuids``).

        Example:
            .. code-block:: python

                from langchain_weaviate import WeaviateVectorStore

                # `embeddings` is any LangChain Embeddings implementation and
                # `client` is a weaviate.WeaviateClient instance.
                vectorstore = WeaviateVectorStore.from_texts(
                    texts,
                    embeddings,
                    client=client
                )
        """

        # Property names are taken from the first metadata dict only.
        attributes = list(metadatas[0].keys()) if metadatas else None

        weaviate_vector_store = cls(
            client,
            index_name,
            text_key,
            embedding=embedding,
            attributes=attributes,
            relevance_score_fn=relevance_score_fn,
            use_multi_tenancy=tenant is not None,
        )

        weaviate_vector_store.add_texts(texts, metadatas, tenant=tenant, **kwargs)

        return weaviate_vector_store
490
+
491
+ def delete(
492
+ self,
493
+ ids: Optional[List[str]] = None,
494
+ tenant: Optional[str] = None,
495
+ **kwargs: Any,
496
+ ) -> None:
497
+ """Delete by vector IDs.
498
+
499
+ Args:
500
+ ids: List of ids to delete.
501
+ tenant: The tenant name. Defaults to None.
502
+ """
503
+
504
+ if ids is None:
505
+ raise ValueError("No ids provided to delete.")
506
+
507
+ id_filter = weaviate.classes.query.Filter.by_id().contains_any(ids)
508
+
509
+ with self._tenant_context(tenant) as collection:
510
+ collection.data.delete_many(where=id_filter)
511
+
512
+ def _does_tenant_exist(self, tenant: str) -> bool:
513
+ """Check if tenant exists in Weaviate."""
514
+ assert (
515
+ self._multi_tenancy_enabled
516
+ ), "Cannot check for tenant existence when multi-tenancy is not enabled"
517
+ tenants = self._collection.tenants.get()
518
+
519
+ return tenant in tenants
520
+
521
+ @contextmanager
522
+ def _tenant_context(
523
+ self, tenant: Optional[str] = None
524
+ ) -> Generator[weaviate.collections.Collection, None, None]:
525
+ """Context manager for handling tenants.
526
+
527
+ Args:
528
+ tenant: The tenant name. Defaults to None.
529
+ """
530
+
531
+ if tenant is not None and not self._multi_tenancy_enabled:
532
+ raise ValueError(
533
+ "Cannot use tenant context when multi-tenancy is not enabled"
534
+ )
535
+
536
+ if tenant is None and self._multi_tenancy_enabled:
537
+ raise ValueError("Must use tenant context when multi-tenancy is enabled")
538
+
539
+ try:
540
+ yield self._collection.with_tenant(tenant)
541
+ finally:
542
+ pass
@@ -0,0 +1,93 @@
1
[tool.poetry]
name = "langchain-weaviate"
version = "0.0.1"
description = "An integration package connecting Weaviate and LangChain"
authors = []
# Matches the LICENSE file shipped with the package, so the license is also
# recorded in the built package metadata.
license = "MIT"
readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = ">=3.9,<4.0"
10
+ langchain-core = "^0.1.33"
11
+ weaviate-client = "^4.0.0"
12
+ numpy = "^1.26.2"
13
+ simsimd = ">=3.6.1,<5.0.0"
14
+
15
+ [tool.poetry.group.test]
16
+ optional = true
17
+
18
+ [tool.poetry.group.test.dependencies]
19
+ pytest = ">=7.3,<9.0"
20
+ freezegun = "^1.2.2"
21
+ pytest-mock = "^3.10.0"
22
+ syrupy = "^4.0.2"
23
+ pytest-watcher = ">=0.3.4,<0.5.0"
24
+ pytest-asyncio = ">=0.21.1,<0.24.0"
25
+ langchain = "^0.1.8"
26
+ pytest-docker = ">=2.0.1,<4.0.0"
27
+ pytest-xdist = "^3.5.0"
28
+ openai = "^1.6.0"
29
+ tiktoken = ">=0.5.2,<0.7.0"
30
+ pytest-cov = ">=4.1,<6.0"
31
+
32
+ [tool.poetry.group.codespell]
33
+ optional = true
34
+
35
+ [tool.poetry.group.codespell.dependencies]
36
+ codespell = "^2.2.0"
37
+
38
+ [tool.poetry.group.lint]
39
+ optional = true
40
+
41
+ [tool.poetry.group.lint.dependencies]
42
+ ruff = ">=0.1.5,<0.4.0"
43
+
44
+ [tool.poetry.group.typing.dependencies]
45
+ mypy = ">=0.991,<1.10"
46
+ types-requests = "^2.31.0.20240403"
47
+
48
+ [tool.poetry.group.dev]
49
+ optional = true
50
+
51
+ [tool.poetry.group.dev.dependencies]
52
+ ipykernel = "^6.27.1"
53
+
54
+ [tool.poetry.group.test_integration]
55
+ optional = true
56
+
57
+ [tool.poetry.group.test_integration.dependencies]
58
+ langchain-openai = ">=0.0.3,<0.2"
59
+
60
+ [tool.ruff]
61
+ lint.select = [
62
+ "E", # pycodestyle
63
+ "F", # pyflakes
64
+ "I", # isort
65
+ ]
66
+
67
[tool.mypy]
# Native TOML boolean rather than the string "True".
disallow_untyped_defs = true
69
+
70
+ [tool.coverage.run]
71
+ omit = ["tests/*"]
72
+
73
+ [build-system]
74
+ requires = ["poetry-core>=1.0.0"]
75
+ build-backend = "poetry.core.masonry.api"
76
+
77
+ [tool.pytest.ini_options]
78
+ # --strict-markers will raise errors on unknown marks.
79
+ # https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
80
+ #
81
+ # https://docs.pytest.org/en/7.1.x/reference/reference.html
82
+ # --strict-config any warnings encountered while parsing the `pytest`
83
+ # section of the configuration file raise errors.
84
+ #
85
+ # https://github.com/tophat/syrupy
86
+ # --snapshot-warn-unused    Prints a warning on unused snapshots rather than failing the test suite (not currently passed in addopts below).
87
+ addopts = "--strict-markers --strict-config --durations=5 -vv -s"
88
+ # Registering custom markers.
89
+ # https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
90
+ markers = [
91
+ "compile: mark placeholder test used to compile integration tests without running them",
92
+ ]
93
+ #asyncio_mode = "auto"