langchain-kinetica 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_kinetica/__init__.py +31 -6
- langchain_kinetica/chat_models.py +537 -0
- langchain_kinetica/document_loaders.py +89 -0
- langchain_kinetica/py.typed +0 -0
- langchain_kinetica/vectorstores.py +934 -0
- langchain_kinetica-1.1.0.dist-info/METADATA +71 -0
- langchain_kinetica-1.1.0.dist-info/RECORD +8 -0
- {langchain_kinetica-1.0.0.dist-info → langchain_kinetica-1.1.0.dist-info}/WHEEL +1 -2
- langchain_kinetica/llm_chat.py +0 -183
- langchain_kinetica/sa_datafile.py +0 -60
- langchain_kinetica/sa_dto.py +0 -111
- langchain_kinetica/sql_output.py +0 -45
- langchain_kinetica-1.0.0.dist-info/LICENSE +0 -21
- langchain_kinetica-1.0.0.dist-info/METADATA +0 -110
- langchain_kinetica-1.0.0.dist-info/RECORD +0 -10
- langchain_kinetica-1.0.0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,934 @@
|
|
|
1
|
+
"""Kinetica vector store implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import enum
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import struct
|
|
10
|
+
import uuid
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from functools import partial
|
|
13
|
+
from typing import TYPE_CHECKING, Any, override
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from gpudb import GPUdb, GPUdbTable
|
|
17
|
+
from langchain_core.documents import Document
|
|
18
|
+
from langchain_core.vectorstores import VectorStore
|
|
19
|
+
from langchain_core.vectorstores.utils import maximal_marginal_relevance
|
|
20
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from collections import OrderedDict
|
|
24
|
+
from collections.abc import Callable, Iterable
|
|
25
|
+
|
|
26
|
+
from langchain_core.embeddings import Embeddings
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DistanceStrategy(str, enum.Enum):
    """Enumerator of the Distance strategies."""

    # Values map to the Kinetica SQL distance functions (see the
    # ``distance_strategy`` property on the vector store class).
    EUCLIDEAN = "l2"
    COSINE = "cosine"
    MAX_INNER_PRODUCT = "inner"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _results_to_docs(docs_and_scores: Any) -> list[Document]:
|
|
38
|
+
"""Return docs from docs and scores."""
|
|
39
|
+
return [doc for doc, _ in docs_and_scores]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Dimension(int, Enum):
    """Some default dimensions for known embeddings."""

    # Vector length of OpenAI's standard embedding output.
    OPENAI = 1536
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Distance metric used when the caller does not specify one explicitly.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN

_LANGCHAIN_DEFAULT_SCHEMA_NAME = "langchain" ## Default Kinetica schema name
_LANGCHAIN_DEFAULT_COLLECTION_NAME = (
    "langchain_kinetica_embeddings" ## Default Kinetica table name
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class KineticaSettings(BaseSettings):
    """`Kinetica` client configuration.

    Connection parameters should be passed as environment variables.

    Example:
        KINETICA_HOST='http://localhost:9191'
        KINETICA_USERNAME='admin'
        KINETICA_PASSWORD=''

    Attributes:
        kdbc (GPUdb, optional): An optional GPUdb connection instance. If not
            provided, the connection will be established using environment variables.
        database (str) : Database name to find the table. Defaults to 'default'.
        table (str) : Table name to operate on.
            Defaults to 'vector_table'.
        metric (str) : Metric to compute distance,
            supported are ('angular', 'euclidean', 'manhattan', 'hamming',
            'dot'). Defaults to 'angular'.
            https://github.com/spotify/annoy/blob/main/src/annoymodule.cc#L149-L169

    """

    # Optional gpudb connection. If this is not provided then use env variables:
    # KINETICA_URL, KINETICA_USERNAME, KINETICA_PASSWORD
    kdbc: GPUdb | None = None

    # Schema and table the store reads/writes; defaults come from the
    # module-level _LANGCHAIN_DEFAULT_* constants.
    database: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME
    table: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME
    # Distance metric name; defaults to DEFAULT_DISTANCE_STRATEGY's value.
    metric: str = DEFAULT_DISTANCE_STRATEGY.value

    def __getitem__(self, item: str) -> Any:
        """Get attribute by key (dict-style access)."""
        return getattr(self, item)

    # Settings are read from a .env file / environment with the
    # KINETICA_ prefix; unknown keys are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="kinetica_",
        extra="ignore",
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class KineticaVectorstore(VectorStore):
|
|
100
|
+
"""`Kinetica` vector store.
|
|
101
|
+
|
|
102
|
+
To use, you should have the ``gpudb`` python package installed.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
config: Kinetica connection settings class.
|
|
106
|
+
embedding_function: Any embedding function implementing
|
|
107
|
+
`langchain.embeddings.base.Embeddings` interface.
|
|
108
|
+
collection_name: The name of the collection to use. (default: langchain)
|
|
109
|
+
NOTE: This is not the name of the table, but the name of the collection.
|
|
110
|
+
The tables will be created when initializing the store (if not exists)
|
|
111
|
+
So, make sure the user has the right permissions to create tables.
|
|
112
|
+
distance_strategy: The distance strategy to use. (default: COSINE)
|
|
113
|
+
pre_delete_collection: If True, will delete the collection if it exists.
|
|
114
|
+
(default: False). Useful for testing.
|
|
115
|
+
engine_args: SQLAlchemy's create engine arguments.
|
|
116
|
+
|
|
117
|
+
Example:
|
|
118
|
+
.. code-block:: python
|
|
119
|
+
|
|
120
|
+
from langchain_community.vectorstores import Kinetica, KineticaSettings
|
|
121
|
+
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
|
122
|
+
|
|
123
|
+
vectorstore = Kinetica.from_documents(
|
|
124
|
+
documents=docs,
|
|
125
|
+
embedding=OpenAIEmbeddings(),
|
|
126
|
+
collection_name="kinetica_store",
|
|
127
|
+
config=KineticaSettings(),
|
|
128
|
+
)
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
    def __init__(
        self,
        config: KineticaSettings,
        embedding_function: Embeddings,
        *,  # to force keyword arguments only
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        pre_delete_collection: bool = False,
        logger: logging.Logger | None = None,
        relevance_score_fn: Callable[[float], float] | None = None,
    ) -> None:
        """Constructor for the Kinetica class.

        Args:
            config (KineticaSettings): a `KineticaSettings` instance
            embedding_function (Embeddings): embedding function to use
            collection_name (str, optional): the Kinetica table name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            schema_name (str, optional): the Kinetica schema name.
                Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
            distance_strategy (DistanceStrategy, optional): distance metric used
                when ranking results. Defaults to DEFAULT_DISTANCE_STRATEGY.
            pre_delete_collection (bool, optional): when True, the schema (and
                all tables in it) is dropped before use. Defaults to False.
            logger (Optional[logging.Logger], optional): logger to use; a
                module-level logger is created when None. Defaults to None.
            relevance_score_fn (Optional[Callable[[float], float]], optional):
                A function that takes in a distance and outputs a relevance score.
                If not provided, a default function will be used based on the
                distance strategy. Defaults to None.
        """
        self._config = config
        self.embedding_function = embedding_function
        self.collection_name = collection_name
        self.schema_name = schema_name
        self._distance_strategy = distance_strategy
        self.pre_delete_collection = pre_delete_collection
        self.logger = logger or logging.getLogger(__name__)
        self.override_relevance_score_fn = relevance_score_fn
        # Resolve the connection eagerly; table/schema creation is deferred to
        # __post_init__ because the embedding dimensionality is not known yet.
        self._db = self.__get_db(self._config)
|
|
171
|
+
|
|
172
|
+
def __post_init__(self, dimensions: int) -> None:
|
|
173
|
+
"""Initialize the store."""
|
|
174
|
+
self.dimensions = dimensions
|
|
175
|
+
dimension_field = f"vector({dimensions})"
|
|
176
|
+
|
|
177
|
+
if self.pre_delete_collection:
|
|
178
|
+
self.delete_schema()
|
|
179
|
+
|
|
180
|
+
self.table_name = self.collection_name
|
|
181
|
+
if self.schema_name is not None and len(self.schema_name) > 0:
|
|
182
|
+
self.table_name = f"{self.schema_name}.{self.collection_name}"
|
|
183
|
+
|
|
184
|
+
self.table_schema = [
|
|
185
|
+
["text", "string"],
|
|
186
|
+
["embedding", "bytes", dimension_field],
|
|
187
|
+
["metadata", "string", "json"],
|
|
188
|
+
["id", "string", "uuid"],
|
|
189
|
+
]
|
|
190
|
+
|
|
191
|
+
self.create_schema()
|
|
192
|
+
self.EmbeddingStore: GPUdbTable = self.create_tables_if_not_exists()
|
|
193
|
+
|
|
194
|
+
def __get_db(self, config: KineticaSettings) -> GPUdb:
|
|
195
|
+
if config.kdbc is not None:
|
|
196
|
+
return config.kdbc
|
|
197
|
+
return GPUdb.get_connection()
|
|
198
|
+
|
|
199
|
+
    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function used by this store."""
        return self.embedding_function
|
|
203
|
+
|
|
204
|
+
    @classmethod
    def __from(
        cls,
        config: KineticaSettings,
        texts: list[str],
        embeddings: list[list[float]],
        embedding: Embeddings,
        dimensions: int,
        *,  # to force keyword arguments only
        metadatas: list[dict] | None = None,
        ids: list[str] | None = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        pre_delete_collection: bool = False,
        logger: logging.Logger | None = None,
        schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
        **kwargs: Any,
    ) -> KineticaVectorstore:
        """Constructor helper.

        Class method to assist in constructing the `Kinetica` store instance
        using different combinations of parameters

        Args:
            config (KineticaSettings): a `KineticaSettings` instance
            texts (List[str]): The list of texts to generate embeddings for and store
            embeddings (List[List[float]]): List of embeddings
            embedding (Embeddings): the Embedding function
            dimensions (int): The number of dimensions the embeddings have
            metadatas (Optional[List[dict]], optional): List of JSON data associated
                with each text. Defaults to None.
            ids (Optional[List[str]], optional): List of unique IDs (UUID by default)
                associated with each text. Defaults to None.
            collection_name (str, optional): Kinetica table name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            schema_name (str, optional): Kinetica schema name.
                Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
            distance_strategy (DistanceStrategy, optional): Not used for now.
                Defaults to DEFAULT_DISTANCE_STRATEGY.
            pre_delete_collection (bool, optional): Whether to delete the Kinetica
                schema or not. Defaults to False.
            logger (Optional[logging.Logger], optional): Logger to use for logging at
                different levels. Defaults to None.

        Returns:
            Kinetica: An instance of Kinetica class

        """
        # Generate UUID ids / empty metadata dicts when not supplied.
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in texts]

        if not metadatas:
            metadatas = [{} for _ in texts]

        store = cls(
            config=config,
            collection_name=collection_name,
            schema_name=schema_name,
            embedding_function=embedding,
            distance_strategy=distance_strategy,
            pre_delete_collection=pre_delete_collection,
            logger=logger,
            **kwargs,
        )

        # Table creation needs the embedding dimensionality, so it happens
        # here rather than in __init__.
        store.__post_init__(dimensions)

        store.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids
        )

        return store
|
|
276
|
+
|
|
277
|
+
    def create_tables_if_not_exists(self) -> Any:
        """Create the table to store the texts and embeddings.

        Returns:
            The ``GPUdbTable`` handle for the embeddings table.
        """
        # NOTE(review): presumably GPUdbTable opens the table when it already
        # exists with this type — confirm against the gpudb client docs.
        return GPUdbTable(
            _type=self.table_schema,
            name=self.table_name,
            db=self._db,
            options={"is_replicated": "true"},
        )
|
|
285
|
+
|
|
286
|
+
def drop_tables(self) -> None:
|
|
287
|
+
"""Delete the table."""
|
|
288
|
+
self._db.clear_table(
|
|
289
|
+
f"{self.table_name}", options={"no_error_if_not_exists": "true"}
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
    def create_schema(self) -> None:
        """Create a new Kinetica schema (``self.schema_name``)."""
        # NOTE(review): assumes create_schema tolerates an already-existing
        # schema — confirm against the gpudb client behavior.
        self._db.create_schema(self.schema_name)
|
|
295
|
+
|
|
296
|
+
    def delete_schema(self) -> None:
        """Delete schema and tables.

        Delete a Kinetica schema with cascade set to `true`
        This method will delete a schema with all tables in it.
        """
        self.logger.debug("Trying to delete collection")
        # cascade drops every table contained in the schema as well.
        self._db.drop_schema(
            self.schema_name, {"no_error_if_not_exists": "true", "cascade": "true"}
        )
|
|
306
|
+
|
|
307
|
+
def add_embeddings(
|
|
308
|
+
self,
|
|
309
|
+
texts: Iterable[str],
|
|
310
|
+
embeddings: list[list[float]],
|
|
311
|
+
metadatas: list[dict] | None = None,
|
|
312
|
+
ids: list[str] | None = None,
|
|
313
|
+
) -> list[str]:
|
|
314
|
+
"""Add embeddings to the vectorstore.
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
texts: Iterable of strings to add to the vectorstore.
|
|
318
|
+
embeddings: List of list of embedding vectors.
|
|
319
|
+
metadatas: List of metadatas associated with the texts.
|
|
320
|
+
ids: List of ids for the text embedding pairs
|
|
321
|
+
kwargs: vectorstore specific parameters
|
|
322
|
+
"""
|
|
323
|
+
if ids is None:
|
|
324
|
+
ids = [str(uuid.uuid4()) for _ in texts]
|
|
325
|
+
|
|
326
|
+
if not metadatas:
|
|
327
|
+
metadatas = [{} for _ in texts]
|
|
328
|
+
|
|
329
|
+
records = []
|
|
330
|
+
for text, embedding, metadata, doc_id in zip(
|
|
331
|
+
texts, embeddings, metadatas, ids, strict=False
|
|
332
|
+
):
|
|
333
|
+
buf = struct.pack(f"{self.dimensions}f", *embedding)
|
|
334
|
+
records.append([text, buf, json.dumps(metadata), doc_id])
|
|
335
|
+
|
|
336
|
+
self.EmbeddingStore.insert_records(records)
|
|
337
|
+
|
|
338
|
+
return ids
|
|
339
|
+
|
|
340
|
+
def add_texts(
|
|
341
|
+
self,
|
|
342
|
+
texts: Iterable[str],
|
|
343
|
+
metadatas: list[dict] | None = None,
|
|
344
|
+
ids: list[str] | None = None,
|
|
345
|
+
**kwargs: Any,
|
|
346
|
+
) -> list[str]:
|
|
347
|
+
"""Run more texts through the embeddings and add to the vectorstore.
|
|
348
|
+
|
|
349
|
+
Args:
|
|
350
|
+
texts: Iterable of strings to add to the vectorstore.
|
|
351
|
+
metadatas: Optional list of metadatas (JSON data) associated with the texts.
|
|
352
|
+
ids: List of IDs (UUID) for the texts supplied; will be generated if None
|
|
353
|
+
kwargs: vectorstore specific parameters
|
|
354
|
+
|
|
355
|
+
Returns:
|
|
356
|
+
List of ids from adding the texts into the vectorstore.
|
|
357
|
+
"""
|
|
358
|
+
embeddings = self.embedding_function.embed_documents(list(texts))
|
|
359
|
+
self.dimensions = len(embeddings[0])
|
|
360
|
+
if not hasattr(self, "EmbeddingStore"):
|
|
361
|
+
self.__post_init__(self.dimensions)
|
|
362
|
+
return self.add_embeddings(
|
|
363
|
+
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
@override
|
|
367
|
+
def similarity_search(
|
|
368
|
+
self,
|
|
369
|
+
query: str,
|
|
370
|
+
k: int = 4,
|
|
371
|
+
emb_filter: dict | None = None,
|
|
372
|
+
**kwargs: Any,
|
|
373
|
+
) -> list[Document]:
|
|
374
|
+
"""Run similarity search with Kinetica with distance.
|
|
375
|
+
|
|
376
|
+
Args:
|
|
377
|
+
query (str): Query text to search for.
|
|
378
|
+
k (int): Number of results to return. Defaults to 4.
|
|
379
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
List of Documents most similar to the query.
|
|
383
|
+
"""
|
|
384
|
+
embedding = self.embedding_function.embed_query(text=query)
|
|
385
|
+
return self.similarity_search_by_vector(
|
|
386
|
+
embedding=embedding,
|
|
387
|
+
k=k,
|
|
388
|
+
emb_filter=emb_filter,
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
def similarity_search_with_score(
|
|
392
|
+
self,
|
|
393
|
+
query: str,
|
|
394
|
+
k: int = 4,
|
|
395
|
+
emb_filter: dict | None = None,
|
|
396
|
+
) -> list[tuple[Document, float]]:
|
|
397
|
+
"""Return docs most similar to query.
|
|
398
|
+
|
|
399
|
+
Args:
|
|
400
|
+
query: Text to look up documents similar to.
|
|
401
|
+
k: Number of Documents to return. Defaults to 4.
|
|
402
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
List of Documents most similar to the query and score for each
|
|
406
|
+
"""
|
|
407
|
+
embedding = self.embedding_function.embed_query(query)
|
|
408
|
+
|
|
409
|
+
return self.similarity_search_with_score_by_vector(
|
|
410
|
+
embedding=embedding, k=k, emb_filter=emb_filter
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
def similarity_search_with_score_by_vector(
|
|
414
|
+
self,
|
|
415
|
+
embedding: list[float],
|
|
416
|
+
k: int = 4,
|
|
417
|
+
emb_filter: dict | None = None,
|
|
418
|
+
) -> list[tuple[Document, float]]:
|
|
419
|
+
"""Return docs most similar to embedding vector.
|
|
420
|
+
|
|
421
|
+
Args:
|
|
422
|
+
embedding: Embedding to look up documents similar to.
|
|
423
|
+
k: Number of Documents to return. Defaults to 4.
|
|
424
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
425
|
+
"""
|
|
426
|
+
results = []
|
|
427
|
+
resp: dict = self.__query_collection(embedding, k, emb_filter)
|
|
428
|
+
if resp and resp["status_info"]["status"] == "OK":
|
|
429
|
+
total_records = resp["total_number_of_records"]
|
|
430
|
+
if total_records > 0:
|
|
431
|
+
records: OrderedDict = resp["records"]
|
|
432
|
+
results = list(zip(*list(records.values()), strict=False))
|
|
433
|
+
|
|
434
|
+
return self._results_to_docs_and_scores(results)
|
|
435
|
+
self.logger.warning(
|
|
436
|
+
"No records found; status: %s", resp["status_info"]["status"]
|
|
437
|
+
)
|
|
438
|
+
return results
|
|
439
|
+
|
|
440
|
+
@override
|
|
441
|
+
def similarity_search_by_vector(
|
|
442
|
+
self,
|
|
443
|
+
embedding: list[float],
|
|
444
|
+
k: int = 4,
|
|
445
|
+
emb_filter: dict | None = None,
|
|
446
|
+
**kwargs: Any,
|
|
447
|
+
) -> list[Document]:
|
|
448
|
+
"""Return docs most similar to embedding vector.
|
|
449
|
+
|
|
450
|
+
Args:
|
|
451
|
+
embedding: Embedding to look up documents similar to.
|
|
452
|
+
k: Number of Documents to return. Defaults to 4.
|
|
453
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
454
|
+
|
|
455
|
+
Returns:
|
|
456
|
+
List of Documents most similar to the query vector.
|
|
457
|
+
"""
|
|
458
|
+
docs_and_scores = self.similarity_search_with_score_by_vector(
|
|
459
|
+
embedding=embedding, k=k, emb_filter=emb_filter
|
|
460
|
+
)
|
|
461
|
+
return [doc for doc, _ in docs_and_scores]
|
|
462
|
+
|
|
463
|
+
def _results_to_docs_and_scores(self, results: Any) -> list[tuple[Document, float]]:
|
|
464
|
+
"""Return docs and scores from results."""
|
|
465
|
+
return (
|
|
466
|
+
[
|
|
467
|
+
(
|
|
468
|
+
Document(
|
|
469
|
+
page_content=result[0],
|
|
470
|
+
metadata=json.loads(result[1]),
|
|
471
|
+
),
|
|
472
|
+
result[2] if self.embedding_function is not None else None,
|
|
473
|
+
)
|
|
474
|
+
for result in results
|
|
475
|
+
]
|
|
476
|
+
if len(results) > 0
|
|
477
|
+
else []
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
def _select_relevance_score_fn(self) -> Callable[[float], float]:
|
|
481
|
+
"""Select the relevance score function based on distance strategy.
|
|
482
|
+
|
|
483
|
+
The 'correct' relevance function may differ depending on a few things,
|
|
484
|
+
including:
|
|
485
|
+
- the distance / similarity metric used by the VectorStore
|
|
486
|
+
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
|
|
487
|
+
- embedding dimensionality
|
|
488
|
+
- etc.
|
|
489
|
+
"""
|
|
490
|
+
if self.override_relevance_score_fn is not None:
|
|
491
|
+
return self.override_relevance_score_fn
|
|
492
|
+
|
|
493
|
+
# Default strategy is to rely on distance strategy provided
|
|
494
|
+
# in vectorstore constructor
|
|
495
|
+
if self._distance_strategy == DistanceStrategy.COSINE:
|
|
496
|
+
return self._cosine_relevance_score_fn
|
|
497
|
+
if self._distance_strategy == DistanceStrategy.EUCLIDEAN:
|
|
498
|
+
return self._euclidean_relevance_score_fn
|
|
499
|
+
if self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
|
|
500
|
+
return self._max_inner_product_relevance_score_fn
|
|
501
|
+
|
|
502
|
+
msg = (
|
|
503
|
+
"No supported normalization function"
|
|
504
|
+
f" for distance_strategy of {self._distance_strategy}."
|
|
505
|
+
"Consider providing relevance_score_fn to Kinetica constructor."
|
|
506
|
+
)
|
|
507
|
+
raise ValueError(msg)
|
|
508
|
+
|
|
509
|
+
@property
|
|
510
|
+
def distance_strategy(self) -> str:
|
|
511
|
+
"""Return the distance strategy."""
|
|
512
|
+
if self._distance_strategy == DistanceStrategy.EUCLIDEAN:
|
|
513
|
+
return "l2_distance"
|
|
514
|
+
if self._distance_strategy == DistanceStrategy.COSINE:
|
|
515
|
+
return "cosine_distance"
|
|
516
|
+
if self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
|
|
517
|
+
return "dot_product"
|
|
518
|
+
msg = (
|
|
519
|
+
f"Got unexpected value for distance: {self._distance_strategy}. "
|
|
520
|
+
f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}."
|
|
521
|
+
)
|
|
522
|
+
raise ValueError(msg)
|
|
523
|
+
|
|
524
|
+
    def __query_collection(
        self,
        embedding: list[float],
        k: int = 4,
        emb_filter: dict[str, str] | None = None,
    ) -> dict:
        """Run the nearest-neighbour SQL query and return the raw response.

        Args:
            embedding: Query vector to rank rows against.
            k: Maximum number of rows to return. Defaults to 4.
            emb_filter: Optional metadata filter compared against the JSON
                ``metadata`` column. Defaults to None.

        Returns:
            The decoded response dict from ``execute_sql_and_decode``.
        """
        # NOTE(review): filter and vector are interpolated directly into the
        # SQL text — safe for numeric vectors, but a metadata filter
        # containing quotes could break or inject into the query; consider
        # parameterizing if emb_filter can come from untrusted input.
        json_filter = json.dumps(emb_filter) if emb_filter is not None else None
        where_clause = (
            f" where '{json_filter}' = JSON(metadata) "
            if json_filter is not None
            else ""
        )

        # Kinetica expects the vector literal in '[x1,x2,...]' form.
        embedding_str = "[" + ",".join([str(x) for x in embedding]) + "]"
        dist_strategy = self.distance_strategy
        query_string = f"""
            SELECT text, metadata, {dist_strategy}(embedding, '{embedding_str}')
                as distance, embedding
            FROM "{self.schema_name}"."{self.collection_name}"
            {where_clause}
            ORDER BY distance asc NULLS LAST
            LIMIT {k}
        """  # noqa: S608

        self.logger.debug(query_string)
        resp = self._db.execute_sql_and_decode(query_string)
        self.logger.debug(resp)
        return resp
|
|
553
|
+
|
|
554
|
+
def max_marginal_relevance_search_with_score_by_vector(
|
|
555
|
+
self,
|
|
556
|
+
embedding: list[float],
|
|
557
|
+
k: int = 4,
|
|
558
|
+
fetch_k: int = 20,
|
|
559
|
+
lambda_mult: float = 0.5,
|
|
560
|
+
emb_filter: dict[str, str] | None = None,
|
|
561
|
+
) -> list[tuple[Document, float]]:
|
|
562
|
+
"""Maximal Marginal Relevance search with score.
|
|
563
|
+
|
|
564
|
+
Return docs selected using the maximal marginal relevance with score
|
|
565
|
+
to embedding vector. Maximal marginal relevance optimizes for similarity
|
|
566
|
+
to query AND diversity among selected documents.
|
|
567
|
+
|
|
568
|
+
Args:
|
|
569
|
+
embedding: Embedding to look up documents similar to.
|
|
570
|
+
k (int): Number of Documents to return. Defaults to 4.
|
|
571
|
+
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
|
572
|
+
Defaults to 20.
|
|
573
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree
|
|
574
|
+
of diversity among the results with 0 corresponding
|
|
575
|
+
to maximum diversity and 1 to minimum diversity.
|
|
576
|
+
Defaults to 0.5.
|
|
577
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
578
|
+
|
|
579
|
+
Returns:
|
|
580
|
+
List[Tuple[Document, float]]: List of Documents selected by maximal marginal
|
|
581
|
+
relevance to the query and score for each.
|
|
582
|
+
"""
|
|
583
|
+
resp = self.__query_collection(
|
|
584
|
+
embedding=embedding, k=fetch_k, emb_filter=emb_filter
|
|
585
|
+
)
|
|
586
|
+
records: OrderedDict = resp["records"]
|
|
587
|
+
results = list(zip(*list(records.values()), strict=False))
|
|
588
|
+
|
|
589
|
+
embedding_list = [
|
|
590
|
+
struct.unpack(f"{self.dimensions}f", embedding)
|
|
591
|
+
for embedding in records["embedding"]
|
|
592
|
+
]
|
|
593
|
+
|
|
594
|
+
mmr_selected = maximal_marginal_relevance(
|
|
595
|
+
np.array(embedding, dtype=np.float32),
|
|
596
|
+
embedding_list,
|
|
597
|
+
k=k,
|
|
598
|
+
lambda_mult=lambda_mult,
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
candidates = self._results_to_docs_and_scores(results)
|
|
602
|
+
|
|
603
|
+
return [r for i, r in enumerate(candidates) if i in mmr_selected]
|
|
604
|
+
|
|
605
|
+
def max_marginal_relevance_search(
|
|
606
|
+
self,
|
|
607
|
+
query: str,
|
|
608
|
+
k: int = 4,
|
|
609
|
+
fetch_k: int = 20,
|
|
610
|
+
lambda_mult: float = 0.5,
|
|
611
|
+
emb_filter: dict[str, str] | None = None,
|
|
612
|
+
**kwargs: Any,
|
|
613
|
+
) -> list[Document]:
|
|
614
|
+
"""Return docs selected using the maximal marginal relevance.
|
|
615
|
+
|
|
616
|
+
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
617
|
+
among selected documents.
|
|
618
|
+
|
|
619
|
+
Args:
|
|
620
|
+
query (str): Text to look up documents similar to.
|
|
621
|
+
k (int): Number of Documents to return. Defaults to 4.
|
|
622
|
+
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
|
623
|
+
Defaults to 20.
|
|
624
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree
|
|
625
|
+
of diversity among the results with 0 corresponding
|
|
626
|
+
to maximum diversity and 1 to minimum diversity.
|
|
627
|
+
Defaults to 0.5.
|
|
628
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
629
|
+
|
|
630
|
+
Returns:
|
|
631
|
+
List[Document]: List of Documents selected by maximal marginal relevance.
|
|
632
|
+
"""
|
|
633
|
+
embedding = self.embedding_function.embed_query(query)
|
|
634
|
+
return self.max_marginal_relevance_search_by_vector(
|
|
635
|
+
embedding,
|
|
636
|
+
k=k,
|
|
637
|
+
fetch_k=fetch_k,
|
|
638
|
+
lambda_mult=lambda_mult,
|
|
639
|
+
emb_filter=emb_filter,
|
|
640
|
+
**kwargs,
|
|
641
|
+
)
|
|
642
|
+
|
|
643
|
+
def max_marginal_relevance_search_with_score(
|
|
644
|
+
self,
|
|
645
|
+
query: str,
|
|
646
|
+
k: int = 4,
|
|
647
|
+
fetch_k: int = 20,
|
|
648
|
+
lambda_mult: float = 0.5,
|
|
649
|
+
emb_filter: dict | None = None,
|
|
650
|
+
**kwargs: Any,
|
|
651
|
+
) -> list[tuple[Document, float]]:
|
|
652
|
+
"""Return docs selected using the maximal marginal relevance with score.
|
|
653
|
+
|
|
654
|
+
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
655
|
+
among selected documents.
|
|
656
|
+
|
|
657
|
+
Args:
|
|
658
|
+
query (str): Text to look up documents similar to.
|
|
659
|
+
k (int): Number of Documents to return. Defaults to 4.
|
|
660
|
+
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
|
661
|
+
Defaults to 20.
|
|
662
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree
|
|
663
|
+
of diversity among the results with 0 corresponding
|
|
664
|
+
to maximum diversity and 1 to minimum diversity.
|
|
665
|
+
Defaults to 0.5.
|
|
666
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
667
|
+
|
|
668
|
+
Returns:
|
|
669
|
+
List[Tuple[Document, float]]: List of Documents selected by maximal marginal
|
|
670
|
+
relevance to the query and score for each.
|
|
671
|
+
"""
|
|
672
|
+
embedding = self.embedding_function.embed_query(query)
|
|
673
|
+
return self.max_marginal_relevance_search_with_score_by_vector(
|
|
674
|
+
embedding=embedding,
|
|
675
|
+
k=k,
|
|
676
|
+
fetch_k=fetch_k,
|
|
677
|
+
lambda_mult=lambda_mult,
|
|
678
|
+
emb_filter=emb_filter,
|
|
679
|
+
**kwargs,
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
def max_marginal_relevance_search_by_vector(
|
|
683
|
+
self,
|
|
684
|
+
embedding: list[float],
|
|
685
|
+
k: int = 4,
|
|
686
|
+
fetch_k: int = 20,
|
|
687
|
+
lambda_mult: float = 0.5,
|
|
688
|
+
emb_filter: dict[str, str] | None = None,
|
|
689
|
+
**kwargs: Any,
|
|
690
|
+
) -> list[Document]:
|
|
691
|
+
"""Maximal Marginal Relevance search.
|
|
692
|
+
|
|
693
|
+
Return docs selected using the maximal marginal relevance to embedding vector.
|
|
694
|
+
Maximal marginal relevance optimizes for similarity to query AND
|
|
695
|
+
diversity among selected documents.
|
|
696
|
+
|
|
697
|
+
Args:
|
|
698
|
+
embedding (str): Text to look up documents similar to.
|
|
699
|
+
k (int): Number of Documents to return. Defaults to 4.
|
|
700
|
+
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
|
701
|
+
Defaults to 20.
|
|
702
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree
|
|
703
|
+
of diversity among the results with 0 corresponding
|
|
704
|
+
to maximum diversity and 1 to minimum diversity.
|
|
705
|
+
Defaults to 0.5.
|
|
706
|
+
emb_filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
707
|
+
|
|
708
|
+
Returns:
|
|
709
|
+
List[Document]: List of Documents selected by maximal marginal relevance.
|
|
710
|
+
"""
|
|
711
|
+
docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector(
|
|
712
|
+
embedding,
|
|
713
|
+
k=k,
|
|
714
|
+
fetch_k=fetch_k,
|
|
715
|
+
lambda_mult=lambda_mult,
|
|
716
|
+
emb_filter=emb_filter,
|
|
717
|
+
**kwargs,
|
|
718
|
+
)
|
|
719
|
+
|
|
720
|
+
return _results_to_docs(docs_and_scores)
|
|
721
|
+
|
|
722
|
+
async def amax_marginal_relevance_search_by_vector(
|
|
723
|
+
self,
|
|
724
|
+
embedding: list[float],
|
|
725
|
+
k: int = 4,
|
|
726
|
+
fetch_k: int = 20,
|
|
727
|
+
lambda_mult: float = 0.5,
|
|
728
|
+
emb_filter: dict[str, str] | None = None,
|
|
729
|
+
**kwargs: Any,
|
|
730
|
+
) -> list[Document]:
|
|
731
|
+
"""Return docs selected using the maximal marginal relevance."""
|
|
732
|
+
# This is a temporary workaround to make the similarity search
|
|
733
|
+
# asynchronous. The proper solution is to make the similarity search
|
|
734
|
+
# asynchronous in the vector store implementations.
|
|
735
|
+
func = partial(
|
|
736
|
+
self.max_marginal_relevance_search_by_vector,
|
|
737
|
+
embedding,
|
|
738
|
+
k=k,
|
|
739
|
+
fetch_k=fetch_k,
|
|
740
|
+
lambda_mult=lambda_mult,
|
|
741
|
+
emb_filter=emb_filter,
|
|
742
|
+
**kwargs,
|
|
743
|
+
)
|
|
744
|
+
return await asyncio.get_event_loop().run_in_executor(None, func)
|
|
745
|
+
|
|
746
|
+
@classmethod
def from_texts(
    cls: type[KineticaVectorstore],
    texts: list[str],
    embedding: Embeddings,
    metadatas: list[dict] | None = None,
    config: KineticaSettings | None = None,
    collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
    distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
    ids: list[str] | None = None,
    *,
    pre_delete_collection: bool = False,
    schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
    **kwargs: Any,
) -> KineticaVectorstore:
    """Adds the texts passed in to the vector store and returns it.

    Args:
        cls (Type[Kinetica]): Kinetica class
        texts (List[str]): A list of texts for which the embeddings are
            generated
        embedding (Embeddings): Embedding model used to embed the texts
        metadatas (Optional[List[dict]], optional): List of dicts, JSON
            describing the texts/documents. Defaults to None.
        config (KineticaSettings): a `KineticaSettings` instance
        collection_name (str, optional): Kinetica collection name.
            Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
        schema_name (str, optional): Kinetica schema name.
            Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
        distance_strategy (DistanceStrategy, optional): Distance strategy
            e.g., l2, cosine etc.. Defaults to DEFAULT_DISTANCE_STRATEGY.
        ids (Optional[List[str]], optional): A list of UUIDs for each
            text/document. Defaults to None.
        pre_delete_collection (bool, optional): Indicates whether the Kinetica
            schema is to be deleted or not. Defaults to False.

    Raises:
        ValueError: If `texts` is empty (the vector dimension could not be
            determined).

    Returns:
        Kinetica: a `Kinetica` instance
    """
    if len(texts) == 0:
        msg = "texts is empty"
        raise ValueError(msg)

    if config is None:
        config = KineticaSettings()

    # Embed all texts exactly once. Some Embeddings implementations only
    # provide embed_query (embed_documents raises NotImplementedError); fall
    # back to per-text embed_query in that case. The previous implementation
    # applied the fallback only while probing the vector dimension and then
    # called embed_documents unconditionally, which crashed for such
    # embedders — and it also embedded the first text twice.
    try:
        embeddings = embedding.embed_documents(list(texts))
    except NotImplementedError:
        embeddings = [embedding.embed_query(t) for t in texts]

    # The actual vector dimension is taken from the produced embeddings.
    dimensions = len(embeddings[0])

    return cls.__from(
        texts=texts,
        embeddings=embeddings,
        embedding=embedding,
        dimensions=dimensions,
        config=config,
        metadatas=metadatas,
        ids=ids,
        collection_name=collection_name,
        schema_name=schema_name,
        distance_strategy=distance_strategy,
        pre_delete_collection=pre_delete_collection,
        **kwargs,
    )
|
|
813
|
+
|
|
814
|
+
@classmethod
def from_embeddings(
    cls: type[KineticaVectorstore],
    text_embeddings: list[tuple[str, list[float]]],
    embedding: Embeddings,
    metadatas: list[dict] | None = None,
    config: KineticaSettings | None = None,
    dimensions: int = Dimension.OPENAI,
    collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
    distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
    ids: list[str] | None = None,
    *,
    pre_delete_collection: bool = False,
    schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
    **kwargs: Any,
) -> KineticaVectorstore:
    """Create a vector store from pre-computed (text, embedding) pairs.

    Args:
        cls (Type[Kinetica]): Kinetica class
        text_embeddings (List[Tuple[str, List[float]]]): A list of texts
            paired with their embedding vectors
        embedding (Embeddings): Embedding model associated with the vectors
        metadatas (Optional[List[dict]], optional): List of dicts, JSON
            describing the texts/documents. Defaults to None.
        config (KineticaSettings): a `KineticaSettings` instance
        dimensions (int, optional): Dimension for the vector data; note that
            the effective dimension is derived from the first embedding
            vector, overriding this value. Defaults to Dimension.OPENAI.
        collection_name (str, optional): Kinetica collection name.
            Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
        schema_name (str, optional): Kinetica schema name.
            Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
        distance_strategy (DistanceStrategy, optional): Distance strategy
            e.g., l2, cosine etc.. Defaults to DEFAULT_DISTANCE_STRATEGY.
        ids (Optional[List[str]], optional): A list of UUIDs for each
            text/document. Defaults to None.
        pre_delete_collection (bool, optional): Indicates whether the
            Kinetica schema is to be deleted or not. Defaults to False.

    Returns:
        Kinetica: a `Kinetica` instance
    """
    config = config if config is not None else KineticaSettings()

    # Split the (text, vector) pairs into parallel lists.
    texts = [pair[0] for pair in text_embeddings]
    vectors = [pair[1] for pair in text_embeddings]
    # Derive the real dimension from the data (shadows the parameter).
    dimensions = len(vectors[0])

    return cls.__from(
        texts=texts,
        embeddings=vectors,
        embedding=embedding,
        dimensions=dimensions,
        config=config,
        metadatas=metadatas,
        ids=ids,
        collection_name=collection_name,
        schema_name=schema_name,
        distance_strategy=distance_strategy,
        pre_delete_collection=pre_delete_collection,
        **kwargs,
    )
|
|
877
|
+
|
|
878
|
+
@classmethod
def from_documents(
    cls: type[KineticaVectorstore],
    documents: list[Document],
    embedding: Embeddings,
    config: KineticaSettings | None = None,
    metadatas: list[dict] | None = None,
    collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
    distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
    ids: list[str] | None = None,
    *,
    pre_delete_collection: bool = False,
    schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
    **kwargs: Any,
) -> KineticaVectorstore:
    """Adds the list of `Document` passed in to the vector store and returns it.

    Args:
        cls (Type[Kinetica]): Kinetica class
        documents (List[Document]): The documents whose contents are embedded
            and stored
        embedding (Embeddings): Embedding model used to embed the documents
        config (KineticaSettings): a `KineticaSettings` instance
        metadatas (Optional[List[dict]], optional): Ignored; the metadata of
            each `Document` is used instead. Defaults to None.
        collection_name (str, optional): Kinetica collection name.
            Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
        schema_name (str, optional): Kinetica schema name.
            Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
        distance_strategy (DistanceStrategy, optional): Distance strategy
            e.g., l2, cosine etc.. Defaults to DEFAULT_DISTANCE_STRATEGY.
        ids (Optional[List[str]], optional): A list of UUIDs for each
            text/document. Defaults to None.
        pre_delete_collection (bool, optional): Indicates whether the Kinetica
            schema is to be deleted or not. Defaults to False.

    Returns:
        Kinetica: a `Kinetica` instance
    """
    if config is None:
        config = KineticaSettings()

    # Pull text and metadata straight off the documents; note that any
    # caller-supplied `metadatas` argument is overwritten here, matching the
    # original behavior.
    page_contents = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    return cls.from_texts(
        texts=page_contents,
        embedding=embedding,
        metadatas=metadatas,
        config=config,
        collection_name=collection_name,
        schema_name=schema_name,
        distance_strategy=distance_strategy,
        ids=ids,
        pre_delete_collection=pre_delete_collection,
        **kwargs,
    )
|