langchain-google-genai 0.0.10rc0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of langchain-google-genai might be problematic. Click here for more details.
- langchain_google_genai/__init__.py +15 -0
- langchain_google_genai/_genai_extension.py +618 -0
- langchain_google_genai/chat_models.py +17 -10
- langchain_google_genai/embeddings.py +26 -12
- langchain_google_genai/genai_aqa.py +134 -0
- langchain_google_genai/google_vector_store.py +493 -0
- langchain_google_genai/llms.py +22 -12
- {langchain_google_genai-0.0.10rc0.dist-info → langchain_google_genai-1.0.1.dist-info}/METADATA +32 -2
- langchain_google_genai-1.0.1.dist-info/RECORD +15 -0
- langchain_google_genai-0.0.10rc0.dist-info/RECORD +0 -12
- {langchain_google_genai-0.0.10rc0.dist-info → langchain_google_genai-1.0.1.dist-info}/LICENSE +0 -0
- {langchain_google_genai-0.0.10rc0.dist-info → langchain_google_genai-1.0.1.dist-info}/WHEEL +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Dict, List, Optional
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
2
|
|
|
3
3
|
# TODO: remove ignore once the google package is published with types
|
|
4
4
|
import google.generativeai as genai # type: ignore[import]
|
|
@@ -43,6 +43,13 @@ class GoogleGenerativeAIEmbeddings(BaseModel, Embeddings):
|
|
|
43
43
|
description="The Google API key to use. If not provided, "
|
|
44
44
|
"the GOOGLE_API_KEY environment variable will be used.",
|
|
45
45
|
)
|
|
46
|
+
credentials: Any = Field(
|
|
47
|
+
default=None,
|
|
48
|
+
exclude=True,
|
|
49
|
+
description="The default custom credentials "
|
|
50
|
+
"(google.auth.credentials.Credentials) to use when making API calls. If not "
|
|
51
|
+
"provided, credentials will be ascertained from the GOOGLE_API_KEY envvar",
|
|
52
|
+
)
|
|
46
53
|
client_options: Optional[Dict] = Field(
|
|
47
54
|
None,
|
|
48
55
|
description=(
|
|
@@ -58,17 +65,24 @@ class GoogleGenerativeAIEmbeddings(BaseModel, Embeddings):
|
|
|
58
65
|
@root_validator()
|
|
59
66
|
def validate_environment(cls, values: Dict) -> Dict:
|
|
60
67
|
"""Validates params and passes them to google-generativeai package."""
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
68
|
+
if values.get("credentials"):
|
|
69
|
+
genai.configure(
|
|
70
|
+
credentials=values.get("credentials"),
|
|
71
|
+
transport=values.get("transport"),
|
|
72
|
+
client_options=values.get("client_options"),
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
google_api_key = get_from_dict_or_env(
|
|
76
|
+
values, "google_api_key", "GOOGLE_API_KEY"
|
|
77
|
+
)
|
|
78
|
+
if isinstance(google_api_key, SecretStr):
|
|
79
|
+
google_api_key = google_api_key.get_secret_value()
|
|
80
|
+
|
|
81
|
+
genai.configure(
|
|
82
|
+
api_key=google_api_key,
|
|
83
|
+
transport=values.get("transport"),
|
|
84
|
+
client_options=values.get("client_options"),
|
|
85
|
+
)
|
|
72
86
|
return values
|
|
73
87
|
|
|
74
88
|
def _embed(
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Google GenerativeAI Attributed Question and Answering (AQA) service.
|
|
2
|
+
|
|
3
|
+
The GenAI Semantic AQA API is a managed end to end service that allows
|
|
4
|
+
developers to create responses grounded on specified passages based on
|
|
5
|
+
a user query. For more information visit:
|
|
6
|
+
https://developers.generativeai.google/guide
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any, List, Optional
|
|
10
|
+
|
|
11
|
+
import google.ai.generativelanguage as genai
|
|
12
|
+
from langchain_core.pydantic_v1 import BaseModel, PrivateAttr
|
|
13
|
+
from langchain_core.runnables import RunnableSerializable
|
|
14
|
+
from langchain_core.runnables.config import RunnableConfig
|
|
15
|
+
|
|
16
|
+
from . import _genai_extension as genaix
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AqaInput(BaseModel):
|
|
20
|
+
"""Input to `GenAIAqa.invoke`.
|
|
21
|
+
|
|
22
|
+
Attributes:
|
|
23
|
+
prompt: The user's inquiry.
|
|
24
|
+
source_passages: A list of passages that the LLM should use only to
|
|
25
|
+
answer the user's inquiry.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
prompt: str
|
|
29
|
+
source_passages: List[str]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AqaOutput(BaseModel):
|
|
33
|
+
"""Output from `GenAIAqa.invoke`.
|
|
34
|
+
|
|
35
|
+
Attributes:
|
|
36
|
+
answer: The answer to the user's inquiry.
|
|
37
|
+
attributed_passages: A list of passages that the LLM used to construct
|
|
38
|
+
the answer.
|
|
39
|
+
answerable_probability: The probability of the question being answered
|
|
40
|
+
from the provided passages.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
answer: str
|
|
44
|
+
attributed_passages: List[str]
|
|
45
|
+
answerable_probability: float
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class _AqaModel(BaseModel):
|
|
49
|
+
"""Wrapper for Google's internal AQA model."""
|
|
50
|
+
|
|
51
|
+
_client: genai.GenerativeServiceClient = PrivateAttr()
|
|
52
|
+
_answer_style: int = PrivateAttr()
|
|
53
|
+
_safety_settings: List[genai.SafetySetting] = PrivateAttr()
|
|
54
|
+
_temperature: Optional[float] = PrivateAttr()
|
|
55
|
+
|
|
56
|
+
def __init__(
|
|
57
|
+
self,
|
|
58
|
+
answer_style: int = genai.GenerateAnswerRequest.AnswerStyle.ABSTRACTIVE,
|
|
59
|
+
safety_settings: List[genai.SafetySetting] = [],
|
|
60
|
+
temperature: Optional[float] = None,
|
|
61
|
+
**kwargs: Any,
|
|
62
|
+
) -> None:
|
|
63
|
+
super().__init__(**kwargs)
|
|
64
|
+
self._client = genaix.build_generative_service()
|
|
65
|
+
self._answer_style = answer_style
|
|
66
|
+
self._safety_settings = safety_settings
|
|
67
|
+
self._temperature = temperature
|
|
68
|
+
|
|
69
|
+
def generate_answer(
|
|
70
|
+
self,
|
|
71
|
+
prompt: str,
|
|
72
|
+
passages: List[str],
|
|
73
|
+
) -> genaix.GroundedAnswer:
|
|
74
|
+
return genaix.generate_answer(
|
|
75
|
+
prompt=prompt,
|
|
76
|
+
passages=passages,
|
|
77
|
+
client=self._client,
|
|
78
|
+
answer_style=self._answer_style,
|
|
79
|
+
safety_settings=self._safety_settings,
|
|
80
|
+
temperature=self._temperature,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class GenAIAqa(RunnableSerializable[AqaInput, AqaOutput]):
|
|
85
|
+
"""Google's Attributed Question and Answering service.
|
|
86
|
+
|
|
87
|
+
Given a user's query and a list of passages, Google's server will return
|
|
88
|
+
a response that is grounded to the provided list of passages. It will not
|
|
89
|
+
base the response on parametric memory.
|
|
90
|
+
|
|
91
|
+
Attributes:
|
|
92
|
+
answer_style: keyword-only argument. See
|
|
93
|
+
`google.ai.generativelanguage.GenerateAnswerRequest.AnswerStyle` for details.
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
# Private wrapper (`_AqaModel`) around the generative service client.
|
|
97
|
+
_client: _AqaModel = PrivateAttr()
|
|
98
|
+
|
|
99
|
+
# Actual type is genai.GenerateAnswerRequest.AnswerStyle.
|
|
100
|
+
# 1 = ABSTRACTIVE.
|
|
101
|
+
# Cannot use the actual type here because user may not have
|
|
102
|
+
# google.generativeai installed.
|
|
103
|
+
answer_style: int = 1
|
|
104
|
+
|
|
105
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
106
|
+
"""Construct a Google Generative AI AQA model.
|
|
107
|
+
|
|
108
|
+
All arguments are optional.
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
answer_style: See
|
|
112
|
+
`google.ai.generativelanguage.GenerateAnswerRequest.AnswerStyle`.
|
|
113
|
+
safety_settings: See `google.ai.generativelanguage.SafetySetting`.
|
|
114
|
+
temperature: 0.0 to 1.0.
|
|
115
|
+
"""
|
|
116
|
+
super().__init__(**kwargs)
|
|
117
|
+
self._client = _AqaModel(**kwargs)
|
|
118
|
+
|
|
119
|
+
def invoke(
|
|
120
|
+
self, input: AqaInput, config: Optional[RunnableConfig] = None
|
|
121
|
+
) -> AqaOutput:
|
|
122
|
+
"""Generates a grounded response using the provided passages."""
|
|
123
|
+
|
|
124
|
+
response = self._client.generate_answer(
|
|
125
|
+
prompt=input.prompt, passages=input.source_passages
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
return AqaOutput(
|
|
129
|
+
answer=response.answer,
|
|
130
|
+
attributed_passages=[
|
|
131
|
+
passage.text for passage in response.attributed_passages
|
|
132
|
+
],
|
|
133
|
+
answerable_probability=response.answerable_probability or 0.0,
|
|
134
|
+
)
|
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
"""Google Generative AI Vector Store.
|
|
2
|
+
|
|
3
|
+
The GenAI Semantic Retriever API is a managed end-to-end service that allows
|
|
4
|
+
developers to create a corpus of documents to perform semantic search on
|
|
5
|
+
related passages given a user query. For more information visit:
|
|
6
|
+
https://developers.generativeai.google/guide
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import (
|
|
12
|
+
Any,
|
|
13
|
+
Callable,
|
|
14
|
+
Dict,
|
|
15
|
+
Iterable,
|
|
16
|
+
List,
|
|
17
|
+
Optional,
|
|
18
|
+
Tuple,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
import google.ai.generativelanguage as genai
|
|
22
|
+
from langchain_core.documents import Document
|
|
23
|
+
from langchain_core.embeddings import Embeddings
|
|
24
|
+
from langchain_core.pydantic_v1 import BaseModel, PrivateAttr
|
|
25
|
+
from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough
|
|
26
|
+
from langchain_core.vectorstores import VectorStore
|
|
27
|
+
|
|
28
|
+
from . import _genai_extension as genaix
|
|
29
|
+
from .genai_aqa import (
|
|
30
|
+
AqaInput,
|
|
31
|
+
AqaOutput,
|
|
32
|
+
GenAIAqa,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ServerSideEmbedding(Embeddings):
|
|
37
|
+
"""Do nothing embedding model where the embedding is done by the server."""
|
|
38
|
+
|
|
39
|
+
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
|
40
|
+
return [[] for _ in texts]
|
|
41
|
+
|
|
42
|
+
def embed_query(self, text: str) -> List[float]:
|
|
43
|
+
return []
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DoesNotExistsException(Exception):
|
|
47
|
+
def __init__(self, *, corpus_id: str, document_id: Optional[str] = None) -> None:
|
|
48
|
+
if document_id is None:
|
|
49
|
+
message = f"No such corpus {corpus_id}"
|
|
50
|
+
else:
|
|
51
|
+
message = f"No such document {document_id} under corpus {corpus_id}"
|
|
52
|
+
super().__init__(message)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class _SemanticRetriever(BaseModel):
|
|
56
|
+
"""Wrapper class to Google's internal semantic retriever service."""
|
|
57
|
+
|
|
58
|
+
name: genaix.EntityName
|
|
59
|
+
_client: genai.RetrieverServiceClient = PrivateAttr()
|
|
60
|
+
|
|
61
|
+
def __init__(self, *, client: genai.RetrieverServiceClient, **kwargs: Any) -> None:
|
|
62
|
+
super().__init__(**kwargs)
|
|
63
|
+
self._client = client
|
|
64
|
+
|
|
65
|
+
@classmethod
|
|
66
|
+
def from_ids(
|
|
67
|
+
cls, corpus_id: str, document_id: Optional[str]
|
|
68
|
+
) -> "_SemanticRetriever":
|
|
69
|
+
name = genaix.EntityName(corpus_id=corpus_id, document_id=document_id)
|
|
70
|
+
client = genaix.build_semantic_retriever()
|
|
71
|
+
|
|
72
|
+
# Check the entity exists on Google server.
|
|
73
|
+
if name.is_corpus():
|
|
74
|
+
if genaix.get_corpus(corpus_id=corpus_id, client=client) is None:
|
|
75
|
+
raise DoesNotExistsException(corpus_id=corpus_id)
|
|
76
|
+
elif name.is_document():
|
|
77
|
+
assert document_id is not None
|
|
78
|
+
if (
|
|
79
|
+
genaix.get_document(
|
|
80
|
+
corpus_id=corpus_id, document_id=document_id, client=client
|
|
81
|
+
)
|
|
82
|
+
is None
|
|
83
|
+
):
|
|
84
|
+
raise DoesNotExistsException(
|
|
85
|
+
corpus_id=corpus_id, document_id=document_id
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
return cls(name=name, client=client)
|
|
89
|
+
|
|
90
|
+
def add_texts(
|
|
91
|
+
self,
|
|
92
|
+
texts: Iterable[str],
|
|
93
|
+
metadatas: Optional[List[Dict[str, Any]]] = None,
|
|
94
|
+
document_id: Optional[str] = None,
|
|
95
|
+
) -> List[str]:
|
|
96
|
+
if self.name.document_id is None and document_id is None:
|
|
97
|
+
raise NotImplementedError(
|
|
98
|
+
"Adding texts to a corpus directly is not supported. "
|
|
99
|
+
"Please provide a document ID under the corpus first. "
|
|
100
|
+
"Then add the texts to the document."
|
|
101
|
+
)
|
|
102
|
+
if (
|
|
103
|
+
self.name.document_id is not None
|
|
104
|
+
and document_id is not None
|
|
105
|
+
and self.name.document_id != document_id
|
|
106
|
+
):
|
|
107
|
+
raise NotImplementedError(
|
|
108
|
+
f"Parameter `document_id` {document_id} does not match the "
|
|
109
|
+
f"vector store's `document_id` {self.name.document_id}"
|
|
110
|
+
)
|
|
111
|
+
assert self.name.document_id or document_id is not None
|
|
112
|
+
new_document_id = self.name.document_id or document_id or ""
|
|
113
|
+
|
|
114
|
+
texts = list(texts)
|
|
115
|
+
if metadatas is None:
|
|
116
|
+
metadatas = [{} for _ in texts]
|
|
117
|
+
if len(texts) != len(metadatas):
|
|
118
|
+
raise ValueError(
|
|
119
|
+
f"metadatas's length {len(metadatas)} and "
|
|
120
|
+
f"texts's length {len(texts)} are mismatched"
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
chunks = genaix.batch_create_chunk(
|
|
124
|
+
corpus_id=self.name.corpus_id,
|
|
125
|
+
document_id=new_document_id,
|
|
126
|
+
texts=texts,
|
|
127
|
+
metadatas=metadatas,
|
|
128
|
+
client=self._client,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
return [chunk.name for chunk in chunks if chunk.name]
|
|
132
|
+
|
|
133
|
+
def similarity_search(
|
|
134
|
+
self,
|
|
135
|
+
query: str,
|
|
136
|
+
k: int = 4,
|
|
137
|
+
filter: Optional[Dict[str, Any]] = None,
|
|
138
|
+
) -> List[Tuple[str, float]]:
|
|
139
|
+
if self.name.is_corpus():
|
|
140
|
+
relevant_chunks = genaix.query_corpus(
|
|
141
|
+
corpus_id=self.name.corpus_id,
|
|
142
|
+
query=query,
|
|
143
|
+
k=k,
|
|
144
|
+
filter=filter,
|
|
145
|
+
client=self._client,
|
|
146
|
+
)
|
|
147
|
+
else:
|
|
148
|
+
assert self.name.is_document()
|
|
149
|
+
assert self.name.document_id is not None
|
|
150
|
+
relevant_chunks = genaix.query_document(
|
|
151
|
+
corpus_id=self.name.corpus_id,
|
|
152
|
+
document_id=self.name.document_id,
|
|
153
|
+
query=query,
|
|
154
|
+
k=k,
|
|
155
|
+
filter=filter,
|
|
156
|
+
client=self._client,
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
return [
|
|
160
|
+
(chunk.chunk.data.string_value, chunk.chunk_relevance_score)
|
|
161
|
+
for chunk in relevant_chunks
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
|
|
165
|
+
for id in ids or []:
|
|
166
|
+
name = genaix.EntityName.from_str(id)
|
|
167
|
+
_delete_chunk(
|
|
168
|
+
corpus_id=name.corpus_id,
|
|
169
|
+
document_id=name.document_id,
|
|
170
|
+
chunk_id=name.chunk_id,
|
|
171
|
+
client=self._client,
|
|
172
|
+
)
|
|
173
|
+
return True
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _delete_chunk(
|
|
177
|
+
*,
|
|
178
|
+
corpus_id: str,
|
|
179
|
+
document_id: Optional[str],
|
|
180
|
+
chunk_id: Optional[str],
|
|
181
|
+
client: genai.RetrieverServiceClient,
|
|
182
|
+
) -> None:
|
|
183
|
+
if chunk_id is not None:
|
|
184
|
+
if document_id is None:
|
|
185
|
+
raise ValueError(f"Chunk {chunk_id} requires a document ID")
|
|
186
|
+
genaix.delete_chunk(
|
|
187
|
+
corpus_id=corpus_id,
|
|
188
|
+
document_id=document_id,
|
|
189
|
+
chunk_id=chunk_id,
|
|
190
|
+
client=client,
|
|
191
|
+
)
|
|
192
|
+
elif document_id is not None:
|
|
193
|
+
genaix.delete_document(
|
|
194
|
+
corpus_id=corpus_id, document_id=document_id, client=client
|
|
195
|
+
)
|
|
196
|
+
else:
|
|
197
|
+
genaix.delete_corpus(corpus_id=corpus_id, client=client)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class GoogleVectorStore(VectorStore):
|
|
201
|
+
"""Google GenerativeAI Vector Store.
|
|
202
|
+
|
|
203
|
+
Currently, it computes the embedding vectors on the server side.
|
|
204
|
+
|
|
205
|
+
Example: Add texts to an existing corpus.
|
|
206
|
+
|
|
207
|
+
store = GoogleVectorStore(corpus_id="123")
|
|
208
|
+
store.add_documents(documents, document_id="456")
|
|
209
|
+
|
|
210
|
+
Example: Create a new corpus.
|
|
211
|
+
|
|
212
|
+
store = GoogleVectorStore.create_corpus(
|
|
213
|
+
corpus_id="123", display_name="My Google corpus")
|
|
214
|
+
|
|
215
|
+
Example: Query the corpus for relevant passages.
|
|
216
|
+
|
|
217
|
+
store.as_retriever() \
|
|
218
|
+
.get_relevant_documents("Who caught the gingerbread man?")
|
|
219
|
+
|
|
220
|
+
Example: Ask the corpus for grounded responses!
|
|
221
|
+
|
|
222
|
+
aqa = store.as_aqa()
|
|
223
|
+
response = aqa.invoke("Who caught the gingerbread man?")
|
|
224
|
+
print(response.answer)
|
|
225
|
+
print(response.attributed_passages)
|
|
226
|
+
print(response.answerable_probability)
|
|
227
|
+
|
|
228
|
+
You can also operate at Google's Document level.
|
|
229
|
+
|
|
230
|
+
Example: Add texts to an existing Google Vector Store Document.
|
|
231
|
+
|
|
232
|
+
doc_store = GoogleVectorStore(corpus_id="123", document_id="456")
|
|
233
|
+
doc_store.add_documents(documents)
|
|
234
|
+
|
|
235
|
+
Example: Create a new Google Vector Store Document.
|
|
236
|
+
|
|
237
|
+
doc_store = GoogleVectorStore.create_document(
|
|
238
|
+
corpus_id="123", document_id="456", display_name="My Google document")
|
|
239
|
+
|
|
240
|
+
Example: Query the Google document.
|
|
241
|
+
|
|
242
|
+
doc_store.as_retriever() \
|
|
243
|
+
.get_relevant_documents("Who caught the gingerbread man?")
|
|
244
|
+
|
|
245
|
+
For more details, see the class's methods.
|
|
246
|
+
"""
|
|
247
|
+
|
|
248
|
+
_retriever: _SemanticRetriever
|
|
249
|
+
|
|
250
|
+
def __init__(
|
|
251
|
+
self, *, corpus_id: str, document_id: Optional[str] = None, **kwargs: Any
|
|
252
|
+
):
|
|
253
|
+
"""Returns an existing Google Semantic Retriever corpus or document.
|
|
254
|
+
|
|
255
|
+
If just the corpus ID is provided, the vector store operates over all
|
|
256
|
+
documents within that corpus.
|
|
257
|
+
|
|
258
|
+
If the document ID is provided, the vector store operates over just that
|
|
259
|
+
document.
|
|
260
|
+
|
|
261
|
+
Raises:
|
|
262
|
+
DoesNotExistsException if the IDs do not match anything on Google
|
|
263
|
+
server. In this case, consider using `create_corpus` or
|
|
264
|
+
`create_document` to create one.
|
|
265
|
+
"""
|
|
266
|
+
super().__init__(**kwargs)
|
|
267
|
+
self._retriever = _SemanticRetriever.from_ids(corpus_id, document_id)
|
|
268
|
+
|
|
269
|
+
@classmethod
|
|
270
|
+
def create_corpus(
|
|
271
|
+
cls,
|
|
272
|
+
corpus_id: Optional[str] = None,
|
|
273
|
+
display_name: Optional[str] = None,
|
|
274
|
+
) -> "GoogleVectorStore":
|
|
275
|
+
"""Create a Google Semantic Retriever corpus.
|
|
276
|
+
|
|
277
|
+
Args:
|
|
278
|
+
corpus_id: The ID to use to create the new corpus. If not provided,
|
|
279
|
+
Google server will provide one.
|
|
280
|
+
display_name: The title of the new corpus. If not provided, Google
|
|
281
|
+
server will provide one.
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
An instance of vector store that points to the newly created corpus.
|
|
285
|
+
"""
|
|
286
|
+
client = genaix.build_semantic_retriever()
|
|
287
|
+
corpus = genaix.create_corpus(
|
|
288
|
+
corpus_id=corpus_id, display_name=display_name, client=client
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
n = genaix.EntityName.from_str(corpus.name)
|
|
292
|
+
return cls(corpus_id=n.corpus_id)
|
|
293
|
+
|
|
294
|
+
@classmethod
|
|
295
|
+
def create_document(
|
|
296
|
+
cls,
|
|
297
|
+
corpus_id: str,
|
|
298
|
+
document_id: Optional[str] = None,
|
|
299
|
+
display_name: Optional[str] = None,
|
|
300
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
301
|
+
) -> "GoogleVectorStore":
|
|
302
|
+
"""Create a Google Semantic Retriever document.
|
|
303
|
+
|
|
304
|
+
Args:
|
|
305
|
+
corpus_id: ID of an existing corpus.
|
|
306
|
+
document_id: The ID to use to create the new Google Semantic
|
|
307
|
+
Retriever document. If not provided, Google server will provide
|
|
308
|
+
one.
|
|
309
|
+
display_name: The title of the new document. If not provided, Google
|
|
310
|
+
server will provide one.
|
|
311
|
+
|
|
312
|
+
Returns:
|
|
313
|
+
An instance of vector store that points to the newly created
|
|
314
|
+
document.
|
|
315
|
+
"""
|
|
316
|
+
client = genaix.build_semantic_retriever()
|
|
317
|
+
document = genaix.create_document(
|
|
318
|
+
corpus_id=corpus_id,
|
|
319
|
+
document_id=document_id,
|
|
320
|
+
display_name=display_name,
|
|
321
|
+
metadata=metadata,
|
|
322
|
+
client=client,
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
assert document.name is not None
|
|
326
|
+
d = genaix.EntityName.from_str(document.name)
|
|
327
|
+
return cls(corpus_id=d.corpus_id, document_id=d.document_id)
|
|
328
|
+
|
|
329
|
+
@classmethod
|
|
330
|
+
def from_texts(
|
|
331
|
+
cls,
|
|
332
|
+
texts: List[str],
|
|
333
|
+
embedding: Optional[Embeddings] = None,
|
|
334
|
+
metadatas: Optional[List[dict[str, Any]]] = None,
|
|
335
|
+
*,
|
|
336
|
+
corpus_id: Optional[str] = None, # str required
|
|
337
|
+
document_id: Optional[str] = None, # str required
|
|
338
|
+
**kwargs: Any,
|
|
339
|
+
) -> "GoogleVectorStore":
|
|
340
|
+
"""Returns a vector store of an existing document with the specified text.
|
|
341
|
+
|
|
342
|
+
Args:
|
|
343
|
+
corpus_id: REQUIRED. Must be an existing corpus.
|
|
344
|
+
document_id: REQUIRED. Must be an existing document.
|
|
345
|
+
texts: Texts to be loaded into the vector store.
|
|
346
|
+
|
|
347
|
+
Returns:
|
|
348
|
+
A vector store pointing to the specified Google Semantic Retriever
|
|
349
|
+
Document.
|
|
350
|
+
|
|
351
|
+
Raises:
|
|
352
|
+
DoesNotExistsException if the IDs do not match anything on the
|
|
353
|
+
Google server.
|
|
354
|
+
"""
|
|
355
|
+
if corpus_id is None or document_id is None:
|
|
356
|
+
raise NotImplementedError(
|
|
357
|
+
"Must provide an existing corpus ID and document ID"
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
doc_store = cls(corpus_id=corpus_id, document_id=document_id, **kwargs)
|
|
361
|
+
doc_store.add_texts(texts, metadatas)
|
|
362
|
+
|
|
363
|
+
return doc_store
|
|
364
|
+
|
|
365
|
+
@property
|
|
366
|
+
def name(self) -> str:
|
|
367
|
+
"""Returns the name of the Google entity.
|
|
368
|
+
|
|
369
|
+
You shouldn't need to care about this unless you want to access your
|
|
370
|
+
corpus or document via Google Generative AI API.
|
|
371
|
+
"""
|
|
372
|
+
return str(self._retriever.name)
|
|
373
|
+
|
|
374
|
+
@property
|
|
375
|
+
def corpus_id(self) -> str:
|
|
376
|
+
"""Returns the corpus ID managed by this vector store."""
|
|
377
|
+
return self._retriever.name.corpus_id
|
|
378
|
+
|
|
379
|
+
@property
|
|
380
|
+
def document_id(self) -> Optional[str]:
|
|
381
|
+
"""Returns the document ID managed by this vector store."""
|
|
382
|
+
return self._retriever.name.document_id
|
|
383
|
+
|
|
384
|
+
def add_texts(
|
|
385
|
+
self,
|
|
386
|
+
texts: Iterable[str],
|
|
387
|
+
metadatas: Optional[List[Dict[str, Any]]] = None,
|
|
388
|
+
*,
|
|
389
|
+
document_id: Optional[str] = None,
|
|
390
|
+
**kwargs: Any,
|
|
391
|
+
) -> List[str]:
|
|
392
|
+
"""Add texts to the vector store.
|
|
393
|
+
|
|
394
|
+
If the vector store points to a corpus (instead of a document), you must
|
|
395
|
+
also provide a `document_id`.
|
|
396
|
+
|
|
397
|
+
Returns:
|
|
398
|
+
Names of the chunks created on Google servers.
|
|
399
|
+
"""
|
|
400
|
+
return self._retriever.add_texts(texts, metadatas, document_id)
|
|
401
|
+
|
|
402
|
+
def similarity_search(
|
|
403
|
+
self,
|
|
404
|
+
query: str,
|
|
405
|
+
k: int = 4,
|
|
406
|
+
filter: Optional[Dict[str, Any]] = None,
|
|
407
|
+
**kwargs: Any,
|
|
408
|
+
) -> List[Document]:
|
|
409
|
+
"""Search the vector store for relevant texts."""
|
|
410
|
+
return [
|
|
411
|
+
document
|
|
412
|
+
for document, _ in self.similarity_search_with_score(
|
|
413
|
+
query, k, filter, **kwargs
|
|
414
|
+
)
|
|
415
|
+
]
|
|
416
|
+
|
|
417
|
+
def similarity_search_with_score(
|
|
418
|
+
self,
|
|
419
|
+
query: str,
|
|
420
|
+
k: int = 4,
|
|
421
|
+
filter: Optional[Dict[str, Any]] = None,
|
|
422
|
+
**kwargs: Any,
|
|
423
|
+
) -> List[Tuple[Document, float]]:
|
|
424
|
+
"""Run similarity search and return each document with its relevance score."""
|
|
425
|
+
return [
|
|
426
|
+
(Document(page_content=text), score)
|
|
427
|
+
for text, score in self._retriever.similarity_search(query, k, filter)
|
|
428
|
+
]
|
|
429
|
+
|
|
430
|
+
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
|
|
431
|
+
"""Delete chunks.
|
|
432
|
+
|
|
433
|
+
Note that the "ids" are not corpus ID or document ID. Rather, these
|
|
434
|
+
are the entity names returned by `add_texts`.
|
|
435
|
+
|
|
436
|
+
Returns:
|
|
437
|
+
True if successful. Otherwise, you should get an exception anyway.
|
|
438
|
+
"""
|
|
439
|
+
return self._retriever.delete(ids)
|
|
440
|
+
|
|
441
|
+
async def adelete(
|
|
442
|
+
self, ids: Optional[List[str]] = None, **kwargs: Any
|
|
443
|
+
) -> Optional[bool]:
|
|
444
|
+
return await asyncio.get_running_loop().run_in_executor(
|
|
445
|
+
None, partial(self.delete, **kwargs), ids
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
def _select_relevance_score_fn(self) -> Callable[[float], float]:
|
|
449
|
+
"""
|
|
450
|
+
TODO: Check with the team about this!
|
|
451
|
+
The underlying vector store already returns a "score proper",
|
|
452
|
+
i.e. one in [0, 1] where higher means more *similar*.
|
|
453
|
+
"""
|
|
454
|
+
return lambda score: score
|
|
455
|
+
|
|
456
|
+
def as_aqa(self, **kwargs: Any) -> Runnable[str, AqaOutput]:
|
|
457
|
+
"""Construct a Google Generative AI AQA engine.
|
|
458
|
+
|
|
459
|
+
All arguments are optional.
|
|
460
|
+
|
|
461
|
+
Args:
|
|
462
|
+
answer_style: See
|
|
463
|
+
`google.ai.generativelanguage.GenerateAnswerRequest.AnswerStyle`.
|
|
464
|
+
safety_settings: See `google.ai.generativelanguage.SafetySetting`.
|
|
465
|
+
temperature: 0.0 to 1.0.
|
|
466
|
+
"""
|
|
467
|
+
return (
|
|
468
|
+
RunnablePassthrough[str]()
|
|
469
|
+
| {
|
|
470
|
+
"prompt": RunnablePassthrough(),
|
|
471
|
+
"passages": self.as_retriever(),
|
|
472
|
+
}
|
|
473
|
+
| RunnableLambda(_toAqaInput)
|
|
474
|
+
| GenAIAqa(**kwargs)
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def _toAqaInput(input: Dict[str, Any]) -> AqaInput:
|
|
479
|
+
prompt = input["prompt"]
|
|
480
|
+
assert isinstance(prompt, str)
|
|
481
|
+
|
|
482
|
+
passages = input["passages"]
|
|
483
|
+
assert isinstance(passages, list)
|
|
484
|
+
|
|
485
|
+
source_passages: List[str] = []
|
|
486
|
+
for passage in passages:
|
|
487
|
+
assert isinstance(passage, Document)
|
|
488
|
+
source_passages.append(passage.page_content)
|
|
489
|
+
|
|
490
|
+
return AqaInput(
|
|
491
|
+
prompt=prompt,
|
|
492
|
+
source_passages=source_passages,
|
|
493
|
+
)
|