langchain-weaviate 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_weaviate-0.0.1/LICENSE +21 -0
- langchain_weaviate-0.0.1/PKG-INFO +58 -0
- langchain_weaviate-0.0.1/README.md +41 -0
- langchain_weaviate-0.0.1/langchain_weaviate/__init__.py +5 -0
- langchain_weaviate-0.0.1/langchain_weaviate/_math.py +63 -0
- langchain_weaviate-0.0.1/langchain_weaviate/py.typed +0 -0
- langchain_weaviate-0.0.1/langchain_weaviate/utils.py +53 -0
- langchain_weaviate-0.0.1/langchain_weaviate/vectorstores.py +542 -0
- langchain_weaviate-0.0.1/pyproject.toml +93 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 LangChain, Inc.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: langchain-weaviate
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: An integration package connecting Weaviate and LangChain
|
|
5
|
+
Requires-Python: >=3.9,<4.0
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Dist: langchain-core (>=0.1.33,<0.2.0)
|
|
12
|
+
Requires-Dist: numpy (>=1.26.2,<2.0.0)
|
|
13
|
+
Requires-Dist: simsimd (>=3.6.1,<5.0.0)
|
|
14
|
+
Requires-Dist: weaviate-client (>=4.0.0,<5.0.0)
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# langchain-weaviate
|
|
18
|
+
|
|
19
|
+
## About
|
|
20
|
+
|
|
21
|
+
This package contains the [Weaviate](https://github.com/weaviate/weaviate) integrations for [LangChain](https://github.com/langchain-ai/langchain).
|
|
22
|
+
|
|
23
|
+
- **Weaviate** is an open source, AI-native vector database that helps developers create intuitive and reliable AI-powered applications.
|
|
24
|
+
- **LangChain** is a framework for developing applications powered by language models.
|
|
25
|
+
|
|
26
|
+
Using this package, LangChain users can conveniently set Weaviate as their vector store to store and retrieve embeddings.
|
|
27
|
+
|
|
28
|
+
## Requirements
|
|
29
|
+
|
|
30
|
+
To use this package, you need to have a running Weaviate instance.
|
|
31
|
+
|
|
32
|
+
Weaviate can be [deployed in many different ways](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate): in containerized environments, on Kubernetes, on-premises, as a managed cloud service, or through a cloud provider such as AWS or Google Cloud.
|
|
33
|
+
|
|
34
|
+
The deployment method to choose depends on your use case and infrastructure requirements.
|
|
35
|
+
|
|
36
|
+
Two of the most common ways to deploy Weaviate are:
|
|
37
|
+
- [Docker Compose](https://weaviate.io/developers/weaviate/installation/docker-compose)
|
|
38
|
+
- [Weaviate Cloud Services (WCS)](https://console.weaviate.cloud)
|
|
39
|
+
|
|
40
|
+
## Installation and Setup
|
|
41
|
+
|
|
42
|
+
As an integration package, this assumes you have already installed LangChain. If not, please refer to the [LangChain installation guide](https://python.langchain.com/docs/get_started/installation).
|
|
43
|
+
|
|
44
|
+
Then, install this package:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install langchain-weaviate
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
Please see the included [Jupyter notebook](docs/vectorstores.ipynb) for an example of how to use this package.
|
|
53
|
+
|
|
54
|
+
## Further resources
|
|
55
|
+
|
|
56
|
+
- [LangChain documentation](https://python.langchain.com/docs)
|
|
57
|
+
- [Weaviate documentation](https://weaviate.io/developers/weaviate)
|
|
58
|
+
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# langchain-weaviate
|
|
2
|
+
|
|
3
|
+
## About
|
|
4
|
+
|
|
5
|
+
This package contains the [Weaviate](https://github.com/weaviate/weaviate) integrations for [LangChain](https://github.com/langchain-ai/langchain).
|
|
6
|
+
|
|
7
|
+
- **Weaviate** is an open source, AI-native vector database that helps developers create intuitive and reliable AI-powered applications.
|
|
8
|
+
- **LangChain** is a framework for developing applications powered by language models.
|
|
9
|
+
|
|
10
|
+
Using this package, LangChain users can conveniently set Weaviate as their vector store to store and retrieve embeddings.
|
|
11
|
+
|
|
12
|
+
## Requirements
|
|
13
|
+
|
|
14
|
+
To use this package, you need to have a running Weaviate instance.
|
|
15
|
+
|
|
16
|
+
Weaviate can be [deployed in many different ways](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate): in containerized environments, on Kubernetes, on-premises, as a managed cloud service, or through a cloud provider such as AWS or Google Cloud.
|
|
17
|
+
|
|
18
|
+
The deployment method to choose depends on your use case and infrastructure requirements.
|
|
19
|
+
|
|
20
|
+
Two of the most common ways to deploy Weaviate are:
|
|
21
|
+
- [Docker Compose](https://weaviate.io/developers/weaviate/installation/docker-compose)
|
|
22
|
+
- [Weaviate Cloud Services (WCS)](https://console.weaviate.cloud)
|
|
23
|
+
|
|
24
|
+
## Installation and Setup
|
|
25
|
+
|
|
26
|
+
As an integration package, this assumes you have already installed LangChain. If not, please refer to the [LangChain installation guide](https://python.langchain.com/docs/get_started/installation).
|
|
27
|
+
|
|
28
|
+
Then, install this package:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install langchain-weaviate
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
Please see the included [Jupyter notebook](docs/vectorstores.ipynb) for an example of how to use this package.
|
|
37
|
+
|
|
38
|
+
## Further resources
|
|
39
|
+
|
|
40
|
+
- [LangChain documentation](https://python.langchain.com/docs)
|
|
41
|
+
- [Weaviate documentation](https://weaviate.io/developers/weaviate)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Math utils."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import simsimd # type: ignore
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Anything convertible to a 2-D float array: nested lists, a list of 1-D
# arrays, or an array.
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]


def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Cosine similarity between the rows of two equal-width matrices.

    Args:
        X: Matrix whose rows are vectors.
        Y: Matrix whose rows are vectors; must have as many columns as ``X``.

    Returns:
        ``1 - simsimd.cdist(X, Y, metric="cosine")`` as a numpy array. An
        empty array is returned when either input is empty. If the result
        collapses to a plain float it is wrapped in a one-element array.

    Raises:
        ValueError: If either input is not two-dimensional, or if the two
            inputs do not have the same number of columns.
    """
    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    # Convert once, directly to the float32 arrays that are handed to simsimd.
    x_arr = np.array(X, dtype=np.float32)
    y_arr = np.array(Y, dtype=np.float32)

    # Reject 1-D (or higher-rank) input explicitly instead of failing with an
    # opaque IndexError on ``shape[1]``.
    if x_arr.ndim != 2 or y_arr.ndim != 2:
        raise ValueError(
            f"X and Y must be two-dimensional. X has shape {x_arr.shape} "
            f"and Y has shape {y_arr.shape}."
        )
    if x_arr.shape[1] != y_arr.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {x_arr.shape} "
            f"and Y has shape {y_arr.shape}."
        )

    Z = 1 - np.array(simsimd.cdist(x_arr, y_arr, metric="cosine"))
    # NOTE(review): presumably cdist can yield a bare scalar for single-row
    # inputs; keep the existing wrapping so callers can still index the result.
    if isinstance(Z, float):
        return np.array([Z])
    return Z
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def cosine_similarity_top_k(
    X: Matrix,
    Y: Matrix,
    top_k: Optional[int] = 5,
    score_threshold: Optional[float] = None,
) -> Tuple[List[Tuple[int, int]], List[float]]:
    """Row-wise cosine similarity with optional top-k and score threshold filtering.

    Args:
        X: Matrix.
        Y: Matrix, same width as X.
        top_k: Max number of results to return. ``None`` (or ``0``) means no
            limit.
        score_threshold: Minimum cosine similarity of results. ``None`` means
            no filtering; an explicit ``0.0`` is honoured.

    Returns:
        Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx),
        second contains corresponding cosine similarities, ordered from the
        highest score to the lowest.
    """
    if len(X) == 0 or len(Y) == 0:
        return [], []

    score_array = cosine_similarity(X, Y)
    flat_scores = score_array.ravel()

    # Filter with an index mask rather than overwriting rejected scores with a
    # sentinel value: a sentinel of 0 would outrank genuine negative scores and
    # would also make legitimate zero scores indistinguishable from rejects.
    if score_threshold is None:
        candidate_idxs = np.arange(flat_scores.size)
    else:
        candidate_idxs = np.flatnonzero(flat_scores >= score_threshold)

    limit = min(top_k, candidate_idxs.size) if top_k else candidate_idxs.size
    if limit <= 0:
        # Nothing passed the filter. Returning early matters: slicing with
        # ``[-0:]`` would otherwise select every element.
        return [], []

    # A stable sort on the negated scores gives descending order.
    order = np.argsort(-flat_scores[candidate_idxs], kind="stable")[:limit]
    top_k_idxs = candidate_idxs[order]

    ret_idxs = np.unravel_index(top_k_idxs, score_array.shape)
    scores = flat_scores[top_k_idxs].tolist()
    return list(zip(*ret_idxs)), scores  # type: ignore
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Utility functions for working with vectors and vectorstores."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from langchain_weaviate._math import cosine_similarity
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DistanceStrategy(str, Enum):
    """Enumeration of the distance strategies for calculating distances
    between vectors.

    Members are also ``str`` instances, and each member's value is identical
    to its name.
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
    DOT_PRODUCT = "DOT_PRODUCT"
    JACCARD = "JACCARD"
    COSINE = "COSINE"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Greedily pick indices of ``embedding_list`` by maximal marginal relevance.

    The first pick is the embedding most similar to the query. Every further
    pick maximises ``lambda_mult * similarity_to_query - (1 - lambda_mult) *
    highest_similarity_to_an_already_picked_embedding``.

    Args:
        query_embedding: Query vector; a 1-D array is expanded to a single row.
        embedding_list: Candidate embeddings.
        lambda_mult: Weight of query relevance versus redundancy.
        k: Maximum number of indices to return.

    Returns:
        At most ``min(k, len(embedding_list))`` indices in selection order;
        an empty list when that bound is zero or negative.
    """
    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)

    similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
    first_pick = int(np.argmax(similarity_to_query))
    chosen = [first_pick]
    chosen_vectors = np.array([embedding_list[first_pick]])

    while len(chosen) < target_count:
        similarity_to_chosen = cosine_similarity(embedding_list, chosen_vectors)
        top_score, top_idx = -np.inf, -1
        for candidate, relevance in enumerate(similarity_to_query):
            if candidate in chosen:
                continue
            # Penalise a candidate by its closest already-selected neighbour.
            redundancy = max(similarity_to_chosen[candidate])
            mmr_score = lambda_mult * relevance - (1 - lambda_mult) * redundancy
            # Strict comparison: on a tie the earliest candidate wins.
            if mmr_score > top_score:
                top_score, top_idx = mmr_score, candidate
        chosen.append(top_idx)
        chosen_vectors = np.append(
            chosen_vectors, [embedding_list[top_idx]], axis=0
        )

    return chosen
|
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import Generator
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from typing import (
|
|
8
|
+
TYPE_CHECKING,
|
|
9
|
+
Any,
|
|
10
|
+
Callable,
|
|
11
|
+
Dict,
|
|
12
|
+
Iterable,
|
|
13
|
+
List,
|
|
14
|
+
Literal,
|
|
15
|
+
Optional,
|
|
16
|
+
Tuple,
|
|
17
|
+
Union,
|
|
18
|
+
overload,
|
|
19
|
+
)
|
|
20
|
+
from uuid import uuid4
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import weaviate # type: ignore
|
|
24
|
+
from langchain_core.documents import Document
|
|
25
|
+
from langchain_core.embeddings import Embeddings
|
|
26
|
+
from langchain_core.vectorstores import VectorStore
|
|
27
|
+
|
|
28
|
+
from langchain_weaviate.utils import maximal_marginal_relevance
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
import weaviate
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): the level is forced to DEBUG and a StreamHandler is attached at
# import time, so merely importing this module changes the logging output of
# the host application. Libraries usually leave handler and level configuration
# to the application - confirm this is intended before changing it.
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
# Timestamp, logger name, level and message, e.g. "2024-Jan-02 03:04 PM - ...".
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%b-%d %I:%M %p"
)
handler.setFormatter(formatter)

logger.addHandler(handler)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _default_schema(index_name: str) -> Dict:
|
|
47
|
+
return {
|
|
48
|
+
"class": index_name,
|
|
49
|
+
"properties": [
|
|
50
|
+
{
|
|
51
|
+
"name": "text",
|
|
52
|
+
"dataType": ["text"],
|
|
53
|
+
}
|
|
54
|
+
],
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _default_score_normalizer(val: float) -> float:
|
|
59
|
+
# prevent overflow
|
|
60
|
+
# use 709 because that's the largest exponent that doesn't overflow
|
|
61
|
+
# use -709 because that's the smallest exponent that doesn't underflow
|
|
62
|
+
val = np.clip(val, -709, 709)
|
|
63
|
+
return 1 - 1 / (1 + np.exp(val))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _json_serializable(value: Any) -> Any:
|
|
67
|
+
if isinstance(value, datetime.datetime):
|
|
68
|
+
return value.isoformat()
|
|
69
|
+
return value
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class WeaviateVectorStore(VectorStore):
    """`Weaviate` vector store.

    To use, you should have the ``weaviate-client`` python package installed.
    The constructor only accepts a ``weaviate.WeaviateClient`` instance.

    Example:
        .. code-block:: python

            import weaviate
            from langchain_weaviate.vectorstores import WeaviateVectorStore

            client = weaviate.connect_to_local()
            store = WeaviateVectorStore(client, index_name, text_key)

    """

    def __init__(
        self,
        client: weaviate.WeaviateClient,
        index_name: Optional[str],
        text_key: str,
        embedding: Optional[Embeddings] = None,
        attributes: Optional[List[str]] = None,
        relevance_score_fn: Optional[
            Callable[[float], float]
        ] = _default_score_normalizer,
        use_multi_tenancy: bool = False,
    ):
        """Initialize with Weaviate client.

        Args:
            client: Weaviate client used for all requests.
            index_name: Collection name. A ``LangChain_<hex>`` name is
                generated when this is ``None`` or empty.
            text_key: Property under which the document text is stored.
            embedding: Embedding model; required by the search methods.
            attributes: Extra attribute names appended to the query attributes.
            relevance_score_fn: Function used to normalise raw scores.
            use_multi_tenancy: Multi-tenancy flag written into the schema that
                is used if the collection has to be created.

        Raises:
            ValueError: If ``client`` is not a ``weaviate.WeaviateClient``.
        """

        if not isinstance(client, weaviate.WeaviateClient):
            raise ValueError(
                "client should be an instance of"
                f" weaviate.WeaviateClient, got {type(client)}"
            )
        self._client = client
        self._index_name = index_name or f"LangChain_{uuid4().hex}"
        self._embedding = embedding
        self._text_key = text_key
        self._query_attrs = [self._text_key]
        self.relevance_score_fn = relevance_score_fn
        if attributes is not None:
            self._query_attrs.extend(attributes)

        schema = _default_schema(self._index_name)
        schema["MultiTenancyConfig"] = {"enabled": use_multi_tenancy}

        # check whether the index already exists
        if not client.collections.exists(self._index_name):
            client.collections.create_from_dict(schema)

        # store collection for convenience
        # this does not actually send a request to weaviate
        self._collection = client.collections.get(self._index_name)

        # store this setting so we don't have to send a request to weaviate
        # every time we want to do a CRUD operation
        self._multi_tenancy_enabled = self._collection.config.get(
            simple=False
        ).multi_tenancy_config.enabled

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """The embedding model passed to the constructor, if any."""
        return self._embedding

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """Return the configured relevance function, or the default one."""
        return (
            self.relevance_score_fn
            if self.relevance_score_fn
            else _default_score_normalizer
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        tenant: Optional[str] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts with metadata (properties) to Weaviate.

        Args:
            texts: Texts to upload. Any iterable is accepted, including
                one-shot iterators such as generators.
            metadatas: Optional per-text metadata, indexed in step with
                ``texts``. ``datetime`` values are stored as ISO strings.
            tenant: Tenant to write to. The tenant is created when it does not
                exist yet.
            **kwargs: ``uuids`` or ``ids`` may supply one id per text; ``uuids``
                takes precedence. A random UUID is used otherwise.

        Returns:
            The ids of the submitted objects, in input order. Objects that
            Weaviate reports as failed are logged as errors, not raised.
        """
        from weaviate.util import get_valid_uuid  # type: ignore

        if tenant and not self._does_tenant_exist(tenant):
            logger.info(
                f"Tenant {tenant} does not exist in index {self._index_name}. "
                "Creating tenant."
            )
            tenant_objs = [weaviate.classes.tenants.Tenant(name=tenant)]
            self._collection.tenants.create(tenants=tenant_objs)

        # Materialise once. ``texts`` is consumed both for embedding and for
        # the upload loop; a generator would be empty on the second pass and
        # nothing would be uploaded.
        text_list = list(texts)

        ids = []
        embeddings: Optional[List[List[float]]] = None
        if self._embedding:
            embeddings = self._embedding.embed_documents(text_list)

        with self._client.batch.dynamic() as batch:
            for i, text in enumerate(text_list):
                data_properties = {self._text_key: text}
                if metadatas is not None:
                    for key, val in metadatas[i].items():
                        data_properties[key] = _json_serializable(val)

                # Allow for ids (consistent w/ other methods)
                # or uuids (backwards compatible w/ existing arg).
                # If the UUID of one of the objects already exists
                # then the existing object will be replaced by the new object.
                _id = get_valid_uuid(uuid4())
                if "uuids" in kwargs:
                    _id = kwargs["uuids"][i]
                elif "ids" in kwargs:
                    _id = kwargs["ids"][i]

                batch.add_object(
                    collection=self._index_name,
                    properties=data_properties,
                    uuid=_id,
                    vector=embeddings[i] if embeddings else None,
                    tenant=tenant,
                )

                ids.append(_id)

        failed_objs = self._client.batch.failed_objects
        for obj in failed_objs:
            err_message = (
                f"Failed to add object: {obj.original_uuid}\nReason: {obj.message}"
            )

            logger.error(err_message)

        return ids

    @overload
    def _perform_search(
        self,
        query: Optional[str],
        k: int,
        return_score: Literal[False] = False,
        tenant: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]: ...

    @overload
    def _perform_search(
        self,
        query: Optional[str],
        k: int,
        return_score: Literal[True],
        tenant: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]: ...

    def _perform_search(
        self,
        query: Optional[str],
        k: int,
        return_score: bool = False,
        tenant: Optional[str] = None,
        **kwargs: Any,
    ) -> Union[List[Document], List[Tuple[Document, float]]]:
        """
        Perform a similarity search.

        Parameters:
            query (str): The query string to search for.
            k (int): The number of results to return.
            return_score (bool, optional): Whether to return the score along with the
              document. Defaults to False.
            tenant (Optional[str], optional): The tenant name. Defaults to None.
            **kwargs: Additional parameters to pass to the search method. These parameters
              will be directly passed to the underlying Weaviate client's search method,
              except ``vector`` (used as the query vector instead of embedding
              ``query``) and ``return_uuids`` (adds each object's uuid to the
              document metadata).

        Returns:
            List[Union[Document, Tuple[Document, float]]]: A list of documents that match
            the query. If return_score is True, each document is returned as a tuple
            with the document and its score.

        Raises:
            ValueError: If _embedding is None, if neither query nor vector is
              provided, or if the Weaviate query fails.
        """
        if self._embedding is None:
            raise ValueError("_embedding cannot be None for similarity_search")

        # Build new lists instead of appending in place: the lists inside
        # ``kwargs`` belong to the caller and must not be modified.
        if "return_metadata" not in kwargs:
            kwargs["return_metadata"] = ["score"]
        elif "score" not in kwargs["return_metadata"]:
            kwargs["return_metadata"] = [*kwargs["return_metadata"], "score"]

        if (
            "return_properties" in kwargs
            and self._text_key not in kwargs["return_properties"]
        ):
            kwargs["return_properties"] = [
                *kwargs["return_properties"],
                self._text_key,
            ]

        vector = kwargs.pop("vector", None)

        # workaround to handle test_max_marginal_relevance_search
        if vector is None:
            if query is None:
                # raise an error because weaviate will do a fetch object query
                # if both query and vector are None
                raise ValueError("Either query or vector must be provided.")
            else:
                vector = self._embedding.embed_query(query)

        return_uuids = kwargs.pop("return_uuids", False)

        with self._tenant_context(tenant) as collection:
            try:
                result = collection.query.hybrid(
                    query=query, vector=vector, limit=k, **kwargs
                )
            except weaviate.exceptions.WeaviateQueryException as e:
                raise ValueError(f"Error during query: {e}") from e

        docs_and_scores: List[Tuple[Document, float]] = []
        for obj in result.objects:
            text = obj.properties.pop(self._text_key)
            # Keep every populated metadata field except the score, which is
            # returned separately.
            filtered_metadata = {
                key: value
                for key, value in obj.metadata.__dict__.items()
                if value is not None and key != "score"
            }
            merged_props = {
                **obj.properties,
                **filtered_metadata,
                **({"vector": obj.vector["default"]} if obj.vector else {}),
                **({"uuid": str(obj.uuid)} if return_uuids else {}),
            }
            doc = Document(page_content=text, metadata=merged_props)
            score = obj.metadata.score
            docs_and_scores.append((doc, score))

        if return_score:
            return docs_and_scores
        else:
            return [doc for doc, _ in docs_and_scores]

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Additional keyword arguments will be passed to the `hybrid()`
                function of the weaviate client.

        Returns:
            List of Documents most similar to the query.
        """

        result = self._perform_search(query, k, **kwargs)
        return result

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.

        Returns:
            List of Documents selected by maximal marginal relevance.

        Raises:
            ValueError: If no embedding model was configured.
        """
        if self._embedding is not None:
            embedding = self._embedding.embed_query(query)
        else:
            raise ValueError(
                "max_marginal_relevance_search requires a suitable Embeddings object"
            )

        return self.max_marginal_relevance_search_by_vector(
            embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, **kwargs
        )

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """

        # Fetch the candidates together with their vectors; the vectors are
        # needed for the MMR computation below.
        results = self._perform_search(
            query=None,
            k=fetch_k,
            include_vector=True,
            vector=embedding,
            **kwargs,
        )

        embeddings = [result.metadata["vector"] for result in results]
        mmr_selected = maximal_marginal_relevance(
            np.array(embedding), embeddings, k=k, lambda_mult=lambda_mult
        )

        docs = []

        for idx in mmr_selected:
            text = results[idx].page_content
            # The vector was only fetched for MMR; drop it from the metadata
            # of the returned document.
            results[idx].metadata.pop("vector")
            docs.append(Document(page_content=text, metadata=results[idx].metadata))

        return docs

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """
        Return the documents most similar to the query text, each paired with
        the ``score`` metadata of the underlying hybrid query.

        NOTE(review): this docstring used to describe the value as a cosine
        distance where lower is better. The value returned here is the score
        of ``collection.query.hybrid``; presumably higher means more
        relevant - confirm against the Weaviate documentation.
        """

        results = self._perform_search(query, k, return_score=True, **kwargs)

        return results

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings],
        metadatas: Optional[List[dict]] = None,
        *,
        tenant: Optional[str] = None,
        client: Optional[weaviate.WeaviateClient] = None,
        index_name: Optional[str] = None,
        text_key: str = "text",
        relevance_score_fn: Optional[
            Callable[[float], float]
        ] = _default_score_normalizer,
        **kwargs: Any,
    ) -> WeaviateVectorStore:
        """Construct Weaviate wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Weaviate instance.
            3. Adds the documents to the newly created Weaviate index.

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to add to vector store.
            embedding: Text embedding model to use.
            client: weaviate.WeaviateClient to use. Although the default is
                ``None``, the constructor rejects anything that is not a
                ``weaviate.WeaviateClient``, so a client must be supplied.
            metadatas: Metadata associated with each text. The keys of the
                first entry are used as the store's extra attributes.
            tenant: The tenant name. Defaults to None. Multi-tenancy is
                requested for the collection when a tenant is given.
            index_name: Index name.
            text_key: Key to use for uploading/retrieving text to/from vectorstore.
            relevance_score_fn: Function for converting whatever distance function the
                vector store uses to a relevance score, which is a normalized similarity
                score (0 means dissimilar, 1 means similar).
            **kwargs: Additional named parameters passed on to ``add_texts()``.

        Example:
            .. code-block:: python

                from langchain_weaviate.vectorstores import WeaviateVectorStore

                store = WeaviateVectorStore.from_texts(
                    texts,
                    embeddings,
                    client=client
                )
        """

        attributes = list(metadatas[0].keys()) if metadatas else None

        weaviate_vector_store = cls(
            client,
            index_name,
            text_key,
            embedding=embedding,
            attributes=attributes,
            relevance_score_fn=relevance_score_fn,
            use_multi_tenancy=tenant is not None,
        )

        weaviate_vector_store.add_texts(texts, metadatas, tenant=tenant, **kwargs)

        return weaviate_vector_store

    def delete(
        self,
        ids: Optional[List[str]] = None,
        tenant: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.
            tenant: The tenant name. Defaults to None.

        Raises:
            ValueError: If ``ids`` is None.
        """

        if ids is None:
            raise ValueError("No ids provided to delete.")

        id_filter = weaviate.classes.query.Filter.by_id().contains_any(ids)

        with self._tenant_context(tenant) as collection:
            collection.data.delete_many(where=id_filter)

    def _does_tenant_exist(self, tenant: str) -> bool:
        """Check if tenant exists in Weaviate.

        Asserts that multi-tenancy is enabled for the collection.
        """
        assert (
            self._multi_tenancy_enabled
        ), "Cannot check for tenant existence when multi-tenancy is not enabled"
        tenants = self._collection.tenants.get()

        return tenant in tenants

    @contextmanager
    def _tenant_context(
        self, tenant: Optional[str] = None
    ) -> Generator[weaviate.collections.Collection, None, None]:
        """Context manager for handling tenants.

        Yields the collection scoped with ``with_tenant(tenant)``.

        Args:
            tenant: The tenant name. Defaults to None.

        Raises:
            ValueError: If a tenant is given while multi-tenancy is disabled,
                or no tenant is given while multi-tenancy is enabled.
        """

        if tenant is not None and not self._multi_tenancy_enabled:
            raise ValueError(
                "Cannot use tenant context when multi-tenancy is not enabled"
            )

        if tenant is None and self._multi_tenancy_enabled:
            raise ValueError("Must use tenant context when multi-tenancy is enabled")

        # No cleanup is needed on exit, so the collection is yielded directly.
        yield self._collection.with_tenant(tenant)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
[tool.poetry]
name = "langchain-weaviate"
version = "0.0.1"
description = "An integration package connecting Weaviate and LangChain"
authors = []
# Matches the MIT LICENSE file shipped with the package.
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.1.33"
weaviate-client = "^4.0.0"
numpy = "^1.26.2"
simsimd = ">=3.6.1,<5.0.0"

[tool.poetry.group.test]
optional = true

[tool.poetry.group.test.dependencies]
pytest = ">=7.3,<9.0"
freezegun = "^1.2.2"
pytest-mock = "^3.10.0"
syrupy = "^4.0.2"
pytest-watcher = ">=0.3.4,<0.5.0"
pytest-asyncio = ">=0.21.1,<0.24.0"
langchain = "^0.1.8"
pytest-docker = ">=2.0.1,<4.0.0"
pytest-xdist = "^3.5.0"
openai = "^1.6.0"
tiktoken = ">=0.5.2,<0.7.0"
pytest-cov = ">=4.1,<6.0"

[tool.poetry.group.codespell]
optional = true

[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0"

[tool.poetry.group.lint]
optional = true

[tool.poetry.group.lint.dependencies]
ruff = ">=0.1.5,<0.4.0"

# NOTE(review): unlike every other group in this file, `typing` is not declared
# optional - confirm that is intended.
[tool.poetry.group.typing.dependencies]
mypy = ">=0.991,<1.10"
types-requests = "^2.31.0.20240403"

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
ipykernel = "^6.27.1"

[tool.poetry.group.test_integration]
optional = true

[tool.poetry.group.test_integration.dependencies]
langchain-openai = ">=0.0.3,<0.2"

[tool.ruff]
lint.select = [
    "E", # pycodestyle
    "F", # pyflakes
    "I", # isort
]

[tool.mypy]
disallow_untyped_defs = true

[tool.coverage.run]
omit = ["tests/*"]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config       any warnings encountered while parsing the `pytest`
#                       section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
#                           (Documented for reference; not part of addopts below.)
addopts = "--strict-markers --strict-config --durations=5 -vv -s"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
    "compile: mark placeholder test used to compile integration tests without running them",
]
#asyncio_mode = "auto"
|