elasticsearch 8.13.0__py3-none-any.whl → 8.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- elasticsearch/_async/client/__init__.py +21 -0
- elasticsearch/_async/client/ml.py +61 -0
- elasticsearch/_async/client/security.py +7 -2
- elasticsearch/_sync/client/__init__.py +21 -0
- elasticsearch/_sync/client/ml.py +61 -0
- elasticsearch/_sync/client/security.py +7 -2
- elasticsearch/_version.py +1 -1
- elasticsearch/helpers/vectorstore/__init__.py +62 -0
- elasticsearch/helpers/vectorstore/_async/__init__.py +16 -0
- elasticsearch/helpers/vectorstore/_async/_utils.py +39 -0
- elasticsearch/helpers/vectorstore/_async/embedding_service.py +89 -0
- elasticsearch/helpers/vectorstore/_async/strategies.py +466 -0
- elasticsearch/helpers/vectorstore/_async/vectorstore.py +391 -0
- elasticsearch/helpers/vectorstore/_sync/__init__.py +16 -0
- elasticsearch/helpers/vectorstore/_sync/_utils.py +39 -0
- elasticsearch/helpers/vectorstore/_sync/embedding_service.py +89 -0
- elasticsearch/helpers/vectorstore/_sync/strategies.py +466 -0
- elasticsearch/helpers/vectorstore/_sync/vectorstore.py +388 -0
- elasticsearch/helpers/vectorstore/_utils.py +116 -0
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/METADATA +5 -2
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/RECORD +25 -13
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/LICENSE +0 -0
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/NOTICE +0 -0
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/WHEEL +0 -0
- {elasticsearch-8.13.0.dist-info → elasticsearch-8.13.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
2
|
+
# license agreements. See the NOTICE file distributed with
|
|
3
|
+
# this work for additional information regarding copyright
|
|
4
|
+
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
5
|
+
# the Apache License, Version 2.0 (the "License"); you may
|
|
6
|
+
# not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import uuid
|
|
20
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
21
|
+
|
|
22
|
+
from elasticsearch import Elasticsearch
|
|
23
|
+
from elasticsearch._version import __versionstr__ as lib_version
|
|
24
|
+
from elasticsearch.helpers import BulkIndexError, bulk
|
|
25
|
+
from elasticsearch.helpers.vectorstore import EmbeddingService, RetrievalStrategy
|
|
26
|
+
from elasticsearch.helpers.vectorstore._utils import maximal_marginal_relevance
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class VectorStore:
|
|
32
|
+
"""
|
|
33
|
+
VectorStore is a higher-level abstraction of indexing and search.
|
|
34
|
+
Users can pick from available retrieval strategies.
|
|
35
|
+
|
|
36
|
+
Documents have up to 3 fields:
|
|
37
|
+
- text_field: the text to be indexed and searched.
|
|
38
|
+
- metadata: additional information about the document, either schema-free
|
|
39
|
+
or defined by the supplied metadata_mappings.
|
|
40
|
+
- vector_field (usually not filled by the user): the embedding vector of the text.
|
|
41
|
+
|
|
42
|
+
Depending on the strategy, vector embeddings are
|
|
43
|
+
- created by the user beforehand
|
|
44
|
+
- created by this VectorStore class in Python
|
|
45
|
+
- created in-stack by inference pipelines.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def __init__(
    self,
    client: Elasticsearch,
    *,
    index: str,
    retrieval_strategy: RetrievalStrategy,
    embedding_service: Optional[EmbeddingService] = None,
    num_dimensions: Optional[int] = None,
    text_field: str = "text_field",
    vector_field: str = "vector_field",
    metadata_mappings: Optional[Dict[str, Any]] = None,
    user_agent: str = f"elasticsearch-py-vs/{lib_version}",
) -> None:
    """
    Bind a vector store to an Elasticsearch client and an index.

    :param client: Elasticsearch client connection. A derived client carrying
        the ``User-Agent`` header is stored; the passed object is not kept.
    :param index: The name of the index to query.
    :param retrieval_strategy: how to index and search the data. See the strategies
        module for available strategies. If the strategy object has ``text_field``
        or ``vector_field`` attributes they are overwritten with the values below.
    :param embedding_service: Optional service used to embed documents and
        queries in Python.
    :param num_dimensions: Optional number of dimensions of the embedding vectors.
    :param text_field: Name of the field with the textual data.
    :param vector_field: For strategies that perform embedding inference in Python,
        the embedding vector goes in this field.
    :param metadata_mappings: Optional mappings for the fields stored under
        ``metadata``.
    :param user_agent: user agent header specific to the 3rd party integration.
        Used for usage tracking in Elastic Cloud.
    """
    # Add integration-specific usage header for tracking usage in Elastic Cloud.
    # client.options preserves existing (non-user-agent) headers.
    tagged_client = client.options(headers={"User-Agent": user_agent})

    # Push the configured field names into the strategy, but only for the
    # attributes the strategy actually declares.
    for attr_name, attr_value in (
        ("text_field", text_field),
        ("vector_field", vector_field),
    ):
        if hasattr(retrieval_strategy, attr_name):
            setattr(retrieval_strategy, attr_name, attr_value)

    self.client = tagged_client
    self.index = index
    self.retrieval_strategy = retrieval_strategy
    self.embedding_service = embedding_service
    self.num_dimensions = num_dimensions
    self.text_field = text_field
    self.vector_field = vector_field
    self.metadata_mappings = metadata_mappings
|
|
90
|
+
|
|
91
|
+
def close(self) -> None:
    """Close the wrapped Elasticsearch client and pass through its result."""
    client = self.client
    return client.close()
|
|
93
|
+
|
|
94
|
+
def add_texts(
    self,
    texts: List[str],
    *,
    metadatas: Optional[List[Dict[str, Any]]] = None,
    vectors: Optional[List[List[float]]] = None,
    ids: Optional[List[str]] = None,
    refresh_indices: bool = True,
    create_index_if_not_exists: bool = True,
    bulk_kwargs: Optional[Dict[str, Any]] = None,
) -> List[str]:
    """Add documents to the Elasticsearch index.

    :param texts: List of text documents.
    :param metadatas: Optional list of document metadata. Must be of same length as
        texts.
    :param vectors: Optional list of embedding vectors. Must be of same length as
        texts. When omitted and an embedding service is configured, the vectors
        are computed from the texts.
    :param ids: Optional list of ID strings. Must be of same length as texts.
        When omitted, random UUIDs are generated.
    :param refresh_indices: Whether to refresh the index after adding documents.
        Defaults to True.
    :param create_index_if_not_exists: Whether to create the index if it does not
        exist. Defaults to True.
    :param bulk_kwargs: Arguments to pass to the bulk function when indexing
        (for example chunk_size).

    :return: List of IDs of the created documents, either echoing the provided one
        or returning newly created ones.
    :raises ValueError: if ids, metadatas or vectors are given with a length
        different from texts.
    :raises BulkIndexError: re-raised after logging when the bulk request fails.
    """
    bulk_kwargs = bulk_kwargs or {}
    ids = ids or [str(uuid.uuid4()) for _ in texts]

    # Fail fast, before touching the cluster: a short list would otherwise
    # surface as a bare IndexError and a long one would be silently truncated.
    for name, values in (("ids", ids), ("metadatas", metadatas), ("vectors", vectors)):
        if values and len(values) != len(texts):
            raise ValueError(
                f"{name} must have the same length as texts: "
                f"got {len(values)} and {len(texts)}"
            )

    if create_index_if_not_exists:
        self._create_index_if_not_exists()

    if self.embedding_service and not vectors:
        vectors = self.embedding_service.embed_documents(texts)

    requests = []
    for i, text in enumerate(texts):
        request: Dict[str, Any] = {
            "_op_type": "index",
            "_index": self.index,
            self.text_field: text,
            "metadata": metadatas[i] if metadatas else {},
            "_id": ids[i],
        }
        if vectors:
            request[self.vector_field] = vectors[i]
        requests.append(request)

    if not requests:
        logger.debug("No texts to add to index")
        return []

    try:
        bulk(
            self.client,
            requests,
            stats_only=True,
            refresh=refresh_indices,
            **bulk_kwargs,
        )
    except BulkIndexError as e:
        logger.error(f"Error adding texts: {e}")
        first_error = e.errors[0].get("index", {}).get("error", {})
        logger.error(f"First error reason: {first_error.get('reason')}")
        # Bare raise keeps the original traceback intact.
        raise

    logger.debug(f"added texts {ids} to index")
    return ids
|
|
169
|
+
|
|
170
|
+
def delete(  # type: ignore[no-untyped-def]
    self,
    *,
    ids: Optional[List[str]] = None,
    query: Optional[Dict[str, Any]] = None,
    refresh_indices: bool = True,
    **delete_kwargs,
) -> bool:
    """Delete documents from the Elasticsearch index.

    Exactly one of ``ids`` and ``query`` must be given.

    :param ids: List of IDs of documents to delete. An empty list deletes
        nothing.
    :param query: Elasticsearch query selecting the documents to delete.
    :param refresh_indices: Whether to refresh the index after deleting documents.
        Defaults to True.
    :param delete_kwargs: Extra keyword arguments forwarded to the bulk helper
        (when deleting by ids) or to delete_by_query (when deleting by query).

    :return: True if deletion was successful.
    :raises ValueError: if both or neither of ids and query are given.
    :raises BulkIndexError: re-raised after logging when the bulk delete fails.
    """
    if ids is not None and query is not None:
        raise ValueError("only one of ids or query must be specified")
    if ids is None and query is None:
        raise ValueError("either specify ids or query")

    if ids is not None:
        if not ids:
            # An empty id list must never reach the query branch below, where
            # query=None would be forwarded to delete_by_query.
            return True

        body = [
            {"_op_type": "delete", "_index": self.index, "_id": _id} for _id in ids
        ]
        try:
            bulk(
                self.client,
                body,
                refresh=refresh_indices,
                ignore_status=404,
                **delete_kwargs,
            )
        except BulkIndexError as e:
            logger.error(f"Error deleting texts: {e}")
            # Bulk error items are keyed by the action type, here "delete".
            first_error = e.errors[0].get("delete", {}).get("error", {})
            logger.error(f"First error reason: {first_error.get('reason')}")
            raise
        logger.debug(f"Deleted {len(body)} texts from index")
        return True

    self.client.delete_by_query(
        index=self.index,
        query=query,
        refresh=refresh_indices,
        **delete_kwargs,
    )
    return True
|
|
221
|
+
|
|
222
|
+
def search(
    self,
    *,
    query: Optional[str],
    query_vector: Optional[List[float]] = None,
    k: int = 4,
    num_candidates: int = 50,
    fields: Optional[List[str]] = None,
    filter: Optional[List[Dict[str, Any]]] = None,
    custom_query: Optional[
        Callable[[Dict[str, Any], Optional[str]], Dict[str, Any]]
    ] = None,
) -> List[Dict[str, Any]]:
    """
    Run a search using the configured retrieval strategy.

    :param query: Input query string.
    :param query_vector: Input embedding vector. If given, the query string is
        not embedded.
    :param k: Number of returned results.
    :param num_candidates: Number of candidates to fetch from data nodes in knn.
    :param fields: List of field names to return. "metadata" and the text field
        are always included; the passed list itself is left untouched.
    :param filter: Elasticsearch filters to apply.
    :param custom_query: Function to modify the Elasticsearch query body before it is
        sent to Elasticsearch.

    :return: List of document hits. Includes _index, _id, _score and _source.
    :raises ValueError: if an embedding service is configured and neither a
        query nor a query_vector is given.
    """
    # Copy before extending so the caller's list is never mutated.
    wanted_fields = list(fields) if fields is not None else []
    for required in ("metadata", self.text_field):
        if required not in wanted_fields:
            wanted_fields.append(required)

    if self.embedding_service and not query_vector:
        if not query:
            raise ValueError("specify a query or a query_vector to search")
        query_vector = self.embedding_service.embed_query(query)

    query_body = self.retrieval_strategy.es_query(
        query=query,
        query_vector=query_vector,
        text_field=self.text_field,
        vector_field=self.vector_field,
        k=k,
        num_candidates=num_candidates,
        filter=filter or [],
    )

    if custom_query is not None:
        query_body = custom_query(query_body, query)
        logger.debug(f"Calling custom_query, Query body now: {query_body}")

    response = self.client.search(
        index=self.index,
        **query_body,
        size=k,
        source=True,
        source_includes=wanted_fields,
    )
    hits: List[Dict[str, Any]] = response["hits"]["hits"]

    return hits
|
|
284
|
+
|
|
285
|
+
def _create_index_if_not_exists(self) -> None:
|
|
286
|
+
exists = self.client.indices.exists(index=self.index)
|
|
287
|
+
if exists.meta.status == 200:
|
|
288
|
+
logger.debug(f"Index {self.index} already exists. Skipping creation.")
|
|
289
|
+
return
|
|
290
|
+
|
|
291
|
+
if self.retrieval_strategy.needs_inference():
|
|
292
|
+
if not self.num_dimensions and not self.embedding_service:
|
|
293
|
+
raise ValueError(
|
|
294
|
+
"retrieval strategy requires embeddings; either embedding_service "
|
|
295
|
+
"or num_dimensions need to be specified"
|
|
296
|
+
)
|
|
297
|
+
if not self.num_dimensions and self.embedding_service:
|
|
298
|
+
vector = self.embedding_service.embed_query("get num dimensions")
|
|
299
|
+
self.num_dimensions = len(vector)
|
|
300
|
+
|
|
301
|
+
mappings, settings = self.retrieval_strategy.es_mappings_settings(
|
|
302
|
+
text_field=self.text_field,
|
|
303
|
+
vector_field=self.vector_field,
|
|
304
|
+
num_dimensions=self.num_dimensions,
|
|
305
|
+
)
|
|
306
|
+
if self.metadata_mappings:
|
|
307
|
+
metadata = mappings["properties"].get("metadata", {"properties": {}})
|
|
308
|
+
for key in self.metadata_mappings.keys():
|
|
309
|
+
if key in metadata:
|
|
310
|
+
raise ValueError(f"metadata key {key} already exists in mappings")
|
|
311
|
+
|
|
312
|
+
metadata = dict(**metadata["properties"], **self.metadata_mappings)
|
|
313
|
+
mappings["properties"]["metadata"] = {"properties": metadata}
|
|
314
|
+
|
|
315
|
+
self.retrieval_strategy.before_index_creation(
|
|
316
|
+
client=self.client,
|
|
317
|
+
text_field=self.text_field,
|
|
318
|
+
vector_field=self.vector_field,
|
|
319
|
+
)
|
|
320
|
+
self.client.indices.create(
|
|
321
|
+
index=self.index, mappings=mappings, settings=settings
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
def max_marginal_relevance_search(
    self,
    *,
    embedding_service: EmbeddingService,
    query: str,
    vector_field: str,
    k: int = 4,
    num_candidates: int = 20,
    lambda_mult: float = 0.5,
    fields: Optional[List[str]] = None,
    custom_query: Optional[
        Callable[[Dict[str, Any], Optional[str]], Dict[str, Any]]
    ] = None,
) -> List[Dict[str, Any]]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    :param embedding_service: Service used to embed the query text.
    :param query: Text to look up documents similar to.
    :param vector_field: Name of the source field holding each document's
        embedding vector.
    :param k: Number of documents to return. Defaults to 4.
    :param num_candidates: Number of documents to fetch to pass to the MMR
        algorithm. Defaults to 20.
    :param lambda_mult: Number between 0 and 1 that determines the degree
        of diversity among the results with 0 corresponding
        to maximum diversity and 1 to minimum diversity.
        Defaults to 0.5.
    :param fields: Other fields to get from elasticsearch source. The passed
        list itself is left untouched.
    :param custom_query: Function to modify the Elasticsearch query body before
        it is sent to Elasticsearch.

    :return: A list of document hits selected by maximal marginal relevance.
        The vector field is removed from each hit's _source unless the caller
        asked for it explicitly through ``fields``.
    """
    # Build a private field list so the caller's list is never mutated.
    if fields is None:
        search_fields = [vector_field]
        remove_vector_query_field_from_metadata = True
    else:
        search_fields = list(fields)
        remove_vector_query_field_from_metadata = vector_field not in search_fields
        if remove_vector_query_field_from_metadata:
            search_fields.append(vector_field)

    # Embed the query
    query_embedding = embedding_service.embed_query(query)

    # Fetch the initial documents
    got_hits = self.search(
        query=None,
        query_vector=query_embedding,
        k=num_candidates,
        fields=search_fields,
        custom_query=custom_query,
    )

    # Get the embeddings for the fetched documents
    got_embeddings = [hit["_source"][vector_field] for hit in got_hits]

    # Select documents using maximal marginal relevance
    selected_indices = maximal_marginal_relevance(
        query_embedding, got_embeddings, lambda_mult=lambda_mult, k=k
    )
    selected_hits = [got_hits[i] for i in selected_indices]

    if remove_vector_query_field_from_metadata:
        for hit in selected_hits:
            del hit["_source"][vector_field]

    return selected_hits
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
2
|
+
# license agreements. See the NOTICE file distributed with
|
|
3
|
+
# this work for additional information regarding copyright
|
|
4
|
+
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
5
|
+
# the Apache License, Version 2.0 (the "License"); you may
|
|
6
|
+
# not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import TYPE_CHECKING, List, Union
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
import numpy as np
|
|
23
|
+
import numpy.typing as npt
|
|
24
|
+
|
|
25
|
+
# Accepted matrix inputs: nested Python lists, a list of numpy rows, or a
# single numpy array. The numpy types are written as strings so the alias can
# be evaluated without numpy being imported at runtime.
Matrix = Union[
    List[List[float]],
    List["npt.NDArray[np.float64]"],
    "npt.NDArray[np.float64]",
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DistanceMetric(str, Enum):
    """Enumeration of Elasticsearch dense vector distance metrics.

    The class mixes in ``str``, so each member is also a string equal to
    its value.
    """

    COSINE = "COSINE"
    DOT_PRODUCT = "DOT_PRODUCT"
    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def maximal_marginal_relevance(
    query_embedding: List[float],
    embedding_list: List[List[float]],
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Pick up to ``k`` indices into ``embedding_list`` by maximal marginal relevance.

    The first pick is the embedding most similar to the query. Each further
    pick maximizes ``lambda_mult * similarity_to_query
    - (1 - lambda_mult) * highest_similarity_to_an_already_picked_embedding``.

    :param query_embedding: Embedding of the query.
    :param embedding_list: Candidate embeddings.
    :param lambda_mult: Weight of query similarity versus redundancy.
    :param k: Maximum number of indices to return.

    :return: Selected indices in pick order; empty when ``k`` is not positive
        or there are no candidates.
    """

    try:
        import numpy as np
    except ModuleNotFoundError as e:
        _raise_missing_mmr_deps_error(e)

    query_arr = np.array(query_embedding)

    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    # The similarity helper works on matrices, so lift a flat query to one row.
    if query_arr.ndim == 1:
        query_arr = query_arr[np.newaxis, :]

    relevance = _cosine_similarity(query_arr, embedding_list)[0]
    first_pick = int(np.argmax(relevance))
    picked = [first_pick]
    picked_vectors = np.array([embedding_list[first_pick]])

    while len(picked) < target_count:
        top_score = -np.inf
        top_index = -1
        overlap = _cosine_similarity(embedding_list, picked_vectors)
        for candidate, candidate_relevance in enumerate(relevance):
            if candidate in picked:
                continue
            # Penalize by the closest already-picked embedding.
            redundancy = max(overlap[candidate])
            score = (
                lambda_mult * candidate_relevance - (1 - lambda_mult) * redundancy
            )
            if score > top_score:
                top_score, top_index = score, candidate
        picked.append(top_index)
        picked_vectors = np.append(
            picked_vectors, [embedding_list[top_index]], axis=0
        )
    return picked
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _cosine_similarity(X: Matrix, Y: Matrix) -> "npt.NDArray[np.float64]":
    """Cosine similarity between the rows of two matrices of equal width.

    Computed as ``1 - simsimd.cdist(X, Y, metric="cosine")`` on float32 copies
    of the inputs. Returns an empty array when either input is empty.

    :raises ValueError: if X and Y have a different number of columns.
    """

    try:
        import numpy as np
        import simsimd as simd
    except ModuleNotFoundError as e:
        _raise_missing_mmr_deps_error(e)

    if not len(X) or not len(Y):
        return np.array([])

    left = np.array(X)
    right = np.array(Y)
    if left.shape[1] != right.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {left.shape} "
            f"and Y has shape {right.shape}."
        )

    left32 = np.array(left, dtype=np.float32)
    right32 = np.array(right, dtype=np.float32)
    similarity = 1 - np.array(simd.cdist(left32, right32, metric="cosine"))
    # A scalar result is wrapped so callers always receive an array.
    if isinstance(similarity, float):
        return np.array([similarity])
    return np.array(similarity)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _raise_missing_mmr_deps_error(parent_error: ModuleNotFoundError) -> None:
|
|
110
|
+
import sys
|
|
111
|
+
|
|
112
|
+
raise ModuleNotFoundError(
|
|
113
|
+
f"Failed to compute maximal marginal relevance because the required "
|
|
114
|
+
f"module '{parent_error.name}' is missing. You can install it by running: "
|
|
115
|
+
f"'{sys.executable} -m pip install elasticsearch[vectorstore_mmr]'"
|
|
116
|
+
) from parent_error
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: elasticsearch
|
|
3
|
-
Version: 8.13.0
|
|
3
|
+
Version: 8.13.2
|
|
4
4
|
Summary: Python client for Elasticsearch
|
|
5
5
|
Home-page: https://github.com/elastic/elasticsearch-py
|
|
6
6
|
Author: Elastic Client Library Maintainers
|
|
@@ -33,7 +33,10 @@ Requires-Dist: aiohttp <4,>=3 ; extra == 'async'
|
|
|
33
33
|
Provides-Extra: orjson
|
|
34
34
|
Requires-Dist: orjson >=3 ; extra == 'orjson'
|
|
35
35
|
Provides-Extra: requests
|
|
36
|
-
Requires-Dist: requests
|
|
36
|
+
Requires-Dist: requests !=2.32.2,<3.0.0,>=2.4.0 ; extra == 'requests'
|
|
37
|
+
Provides-Extra: vectorstore_mmr
|
|
38
|
+
Requires-Dist: numpy >=1 ; extra == 'vectorstore_mmr'
|
|
39
|
+
Requires-Dist: simsimd >=3 ; extra == 'vectorstore_mmr'
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
Elasticsearch Python Client
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
elasticsearch/__init__.py,sha256=w5YnO16zjOi6loGJ8caUgSXsj3b-Y8OfF0BIddP2BiE,3289
|
|
2
2
|
elasticsearch/_otel.py,sha256=9nMvDwSz_BephEQmkETCmEdA_6YNtj_CJQyxIwOzEGo,3406
|
|
3
3
|
elasticsearch/_utils.py,sha256=Vr_aNG5ddxInE1PgDpCXMYpXBTNUFM9nYrgbw-cjeCc,1419
|
|
4
|
-
elasticsearch/_version.py,sha256=
|
|
4
|
+
elasticsearch/_version.py,sha256=0P-o-HdbMQa0vGhTZYE8ZkuZ9v7zXoV3M6M0wUoqfrs,814
|
|
5
5
|
elasticsearch/client.py,sha256=p4naAgAPs_B9laux-aC-bpZNvbu9fvrpicBvLRlp_yw,5120
|
|
6
6
|
elasticsearch/compat.py,sha256=hL3mtqVxWwxeiFbNGADva5XruAwK-A6xcu-vrpnDXFs,2657
|
|
7
7
|
elasticsearch/exceptions.py,sha256=HHqidMTlLXRcNbo4gjsbhHsLfiV4JGg7tx1lXQndG98,4025
|
|
@@ -10,7 +10,7 @@ elasticsearch/serializer.py,sha256=hPBqMHweVES7ixgKhe5KMMQGAdDamzW-PAn7jX8uqcs,7
|
|
|
10
10
|
elasticsearch/transport.py,sha256=CxKo2USPQ-6q4x8Ckfq_dUFWhA1s98p75ghXc3breJI,2248
|
|
11
11
|
elasticsearch/_async/__init__.py,sha256=TZps9WjF-TaSNzBvW5wUCgXRcbHnvE_9xAynBHsMtSo,787
|
|
12
12
|
elasticsearch/_async/helpers.py,sha256=vX2oGiYHO5raZJOnTMEX9hViPNy1YW26IeS60gbhNno,22285
|
|
13
|
-
elasticsearch/_async/client/__init__.py,sha256=
|
|
13
|
+
elasticsearch/_async/client/__init__.py,sha256=0JS0lH4tghGRfN5I2TiUv6f0VsF4et_cCD3lD7vDFao,240323
|
|
14
14
|
elasticsearch/_async/client/_base.py,sha256=wDJIs-4Z_fDBF0_XvfCmfSuyL6Oh8I2nSGDZyP91XCU,15531
|
|
15
15
|
elasticsearch/_async/client/async_search.py,sha256=bv2GC_5ZPX3HpOHCJLf7-CzjPxEQSyhWTlbn7K40ODY,29238
|
|
16
16
|
elasticsearch/_async/client/autoscaling.py,sha256=eV4PGsQc3F-ykw8nRfRZcjD0Xr_0eo1oFbOh5nBBvok,7741
|
|
@@ -31,14 +31,14 @@ elasticsearch/_async/client/ingest.py,sha256=5V8fur9z4Vkv4p-3geOfc1ek5N8qJ6CvWbd
|
|
|
31
31
|
elasticsearch/_async/client/license.py,sha256=kuqW-yfOVdsDHh5puu7_tD_LN7JU0Axt0AtE5ttx8jc,12100
|
|
32
32
|
elasticsearch/_async/client/logstash.py,sha256=JrvbrTi9sCO-GzOe3lBWcaIOPkYFqzenCkYw_B6JW7Q,6127
|
|
33
33
|
elasticsearch/_async/client/migration.py,sha256=It0NTjXaN-UW8keaNURglL4rYbOmAvTbHD7QlFBOJg4,5331
|
|
34
|
-
elasticsearch/_async/client/ml.py,sha256=
|
|
34
|
+
elasticsearch/_async/client/ml.py,sha256=tckls24jxPSOb_OA3f6plc0DDLfKhwddhpAoVfyr0gU,241082
|
|
35
35
|
elasticsearch/_async/client/monitoring.py,sha256=ZQSiQ8AXfu_8_--cHOpfhjpsQkQZekmBlPiOLAYotpw,3822
|
|
36
36
|
elasticsearch/_async/client/nodes.py,sha256=xmLfm0cPsQSjH_WKyzGKHI6IhgT9gKhj3XrsJdTXL5w,23723
|
|
37
37
|
elasticsearch/_async/client/query_ruleset.py,sha256=vBgpKRYL0X7giU05Vw6-pXre10W81zGoALWL_XAnkRM,7606
|
|
38
38
|
elasticsearch/_async/client/rollup.py,sha256=g_zc1KwIW9Bms2XPO0jkaqhrDGVy05WvA1pYYBfgyLc,20828
|
|
39
39
|
elasticsearch/_async/client/search_application.py,sha256=2UWgrM69Ai-O6oMbBrUxmW9BBBdTDI_UPGFzW86wIFU,15047
|
|
40
40
|
elasticsearch/_async/client/searchable_snapshots.py,sha256=WsRPKq-ZjFwt7nj6GPCib1RwR7f5YG-Qo58BgZxJpqc,11694
|
|
41
|
-
elasticsearch/_async/client/security.py,sha256=
|
|
41
|
+
elasticsearch/_async/client/security.py,sha256=OJ7vXLuIja8VGHZIxteLjuEcMaLyfKUGOi75f-N54Eo,129415
|
|
42
42
|
elasticsearch/_async/client/shutdown.py,sha256=Ckk_YcGy1yO2PuD0LDIulxkFFcmx4t7V3z_thRUWkEw,11364
|
|
43
43
|
elasticsearch/_async/client/slm.py,sha256=lVnfb9LERioxR9frUwZA6l9UZaQcb3sYzBBLo1fuimE,16150
|
|
44
44
|
elasticsearch/_async/client/snapshot.py,sha256=c6FSTngRw5PQK1UEFlA_vXPZOmbHMFi9FtT63y_mbbE,37085
|
|
@@ -52,7 +52,7 @@ elasticsearch/_async/client/utils.py,sha256=JwFOxo-YrRg4exXlpiUuOG1uNtJKP_VeK0Hg
|
|
|
52
52
|
elasticsearch/_async/client/watcher.py,sha256=rp8cX_RgvpPp0Zny0fXPQ6Tx-oik2l3y38z1n8b4h44,25760
|
|
53
53
|
elasticsearch/_async/client/xpack.py,sha256=yNFQduIcpIfbvY2-spbOOoQWQbKl0yx71mB-tMV9Sic,4490
|
|
54
54
|
elasticsearch/_sync/__init__.py,sha256=TZps9WjF-TaSNzBvW5wUCgXRcbHnvE_9xAynBHsMtSo,787
|
|
55
|
-
elasticsearch/_sync/client/__init__.py,sha256=
|
|
55
|
+
elasticsearch/_sync/client/__init__.py,sha256=WhHCGYW8u7czxp1LTiqC6DMgwzM5DocqP_i8mCkvr3k,239692
|
|
56
56
|
elasticsearch/_sync/client/_base.py,sha256=LesRKQzvgstEPn-hzoY7PBq_4hAyPCzjGiUggGn-fB8,15461
|
|
57
57
|
elasticsearch/_sync/client/async_search.py,sha256=miChWFtoXb8UhC4mrfzcM0vXxxOAoHsno1zJVey2vHM,29190
|
|
58
58
|
elasticsearch/_sync/client/autoscaling.py,sha256=eSNyAXvp7TkYTVthPd8ez5UpCq_UrAYVEWGQw0cjriI,7693
|
|
@@ -73,14 +73,14 @@ elasticsearch/_sync/client/ingest.py,sha256=UY-x1MXjnD2LkbX8O2qa5Ag6eD4lzf4LeZO8
|
|
|
73
73
|
elasticsearch/_sync/client/license.py,sha256=tEc366OvzPh9OFbgCMAo8dyw7wjajIcNQyfAX-uin3I,12016
|
|
74
74
|
elasticsearch/_sync/client/logstash.py,sha256=2Mpd3y-wNR3H7wsFW2riiB6bbobctNoMW22VOvDbN2s,6091
|
|
75
75
|
elasticsearch/_sync/client/migration.py,sha256=dJ27K1SLcJn4XyBvyPFVocjxHtsAyf2vuOQl52uxK64,5295
|
|
76
|
-
elasticsearch/_sync/client/ml.py,sha256=
|
|
76
|
+
elasticsearch/_sync/client/ml.py,sha256=7YgW9g5MQ4KUhI05X_mp7zDHwmRhDAXc0UahK_7qmwM,240206
|
|
77
77
|
elasticsearch/_sync/client/monitoring.py,sha256=Wh2RTfFOOm5ssbdGv0_gSDS2og1L8ATJamLuUuNrBYU,3810
|
|
78
78
|
elasticsearch/_sync/client/nodes.py,sha256=YloTg5V90lyGeoY_XU_2X2aeLHOpPss1g7Kp0hR2EF4,23639
|
|
79
79
|
elasticsearch/_sync/client/query_ruleset.py,sha256=dRzaVZVYLtlRi6PJVGQTPXCKCJuuf6fkU6wug-zmmXs,7558
|
|
80
80
|
elasticsearch/_sync/client/rollup.py,sha256=ow9wty2HE3doDEMcg-krKWmbgDsAD6brZe6IuenlV94,20732
|
|
81
81
|
elasticsearch/_sync/client/search_application.py,sha256=QWZqLxnaNo7XDg1xQPa0dY2j8LiJn03DDxiygJirqEc,14951
|
|
82
82
|
elasticsearch/_sync/client/searchable_snapshots.py,sha256=ehDowrZcltn4DJUkswyTOXk16QxVHdnid1fb_SdZ9UY,11646
|
|
83
|
-
elasticsearch/_sync/client/security.py,sha256=
|
|
83
|
+
elasticsearch/_sync/client/security.py,sha256=P8hvujP7dJYA1xsmbcLRQVmR586qRMgIHEn3p-QECs8,128803
|
|
84
84
|
elasticsearch/_sync/client/shutdown.py,sha256=YWT70DR1u4wBh1kKAtO-9hkJTm8OrYfTuUiLshwxebM,11328
|
|
85
85
|
elasticsearch/_sync/client/slm.py,sha256=WaMu3iQtftjelmfHyZUExLpzfkwJKS-iR218VZzPOiA,16042
|
|
86
86
|
elasticsearch/_sync/client/snapshot.py,sha256=pB1Gsv2kmmb-uyLlSRdd0K-g2B7xCFojyk3KgGsu7a0,36953
|
|
@@ -96,9 +96,21 @@ elasticsearch/_sync/client/xpack.py,sha256=PzQCp2i4nNzCtV1HGtLYuwqUDGnIe4On7f2Wy
|
|
|
96
96
|
elasticsearch/helpers/__init__.py,sha256=7X10XwdP_fP1QTHGcOxGbCvl2oBevkz_DjhjXCh_59I,1470
|
|
97
97
|
elasticsearch/helpers/actions.py,sha256=oEVAqOjWBn9PLFPpdIEBb5BMKLgN4mi7EozEcOqb4us,30956
|
|
98
98
|
elasticsearch/helpers/errors.py,sha256=GKtlM2687mbBC8PjwQGClBFE4sD129Ytb6wkHZveFJw,1213
|
|
99
|
-
elasticsearch
|
|
100
|
-
elasticsearch
|
|
101
|
-
elasticsearch
|
|
102
|
-
elasticsearch
|
|
103
|
-
elasticsearch
|
|
104
|
-
elasticsearch
|
|
99
|
+
elasticsearch/helpers/vectorstore/__init__.py,sha256=znQOANiaSZOJco_dkBf06wpFMKwK0OoDcNkkS8NMWKE,2192
|
|
100
|
+
elasticsearch/helpers/vectorstore/_utils.py,sha256=xJwCFq7sqUBeq143tfnfm3i4e-ta88s85wKZmPZwJWg,3985
|
|
101
|
+
elasticsearch/helpers/vectorstore/_async/__init__.py,sha256=TZps9WjF-TaSNzBvW5wUCgXRcbHnvE_9xAynBHsMtSo,787
|
|
102
|
+
elasticsearch/helpers/vectorstore/_async/_utils.py,sha256=wYlPKvAT4bflJjULLB2LMjJroAgX6tjoDGBPT6V1gj8,1608
|
|
103
|
+
elasticsearch/helpers/vectorstore/_async/embedding_service.py,sha256=Qv4HsPC4k6J00K4ajhJPFlET6fOTV-l74iDCr4dpZgc,3655
|
|
104
|
+
elasticsearch/helpers/vectorstore/_async/strategies.py,sha256=nh_wvNfWmZIjbTQ38FGSkUeF4_yDk48zukNuhOkq9bE,15322
|
|
105
|
+
elasticsearch/helpers/vectorstore/_async/vectorstore.py,sha256=YiwiNJsslVPmfrqB5jpkNchHKoGGZLdz_tieHMR0Jpk,15161
|
|
106
|
+
elasticsearch/helpers/vectorstore/_sync/__init__.py,sha256=TZps9WjF-TaSNzBvW5wUCgXRcbHnvE_9xAynBHsMtSo,787
|
|
107
|
+
elasticsearch/helpers/vectorstore/_sync/_utils.py,sha256=5pdvNS5XC3wqShjliW9Njl9tVuyI9WMy0cxc5-97K-c,1569
|
|
108
|
+
elasticsearch/helpers/vectorstore/_sync/embedding_service.py,sha256=sAw_WKUcmyqOOJRqnNesZCzn7ZyA91v4NvvQszHIWJ8,3582
|
|
109
|
+
elasticsearch/helpers/vectorstore/_sync/strategies.py,sha256=0Q1zoOrO51S6HXjXwkAePPVtCUGQz5lKN9NyRCso-GU,15220
|
|
110
|
+
elasticsearch/helpers/vectorstore/_sync/vectorstore.py,sha256=MqTDeHyG4NS9gInxwljh4J0QVvS499SajQ22fe1UpLs,14970
|
|
111
|
+
elasticsearch-8.13.2.dist-info/LICENSE,sha256=XfKg2H1sVi8OoRxoisUlMqoo10TKvHmU_wU39ks7MyA,10143
|
|
112
|
+
elasticsearch-8.13.2.dist-info/METADATA,sha256=Eex24ZRMP8r810PL3irLjbwO0c_KeWgszymGV2Q6k8M,6464
|
|
113
|
+
elasticsearch-8.13.2.dist-info/NOTICE,sha256=t4IjKAJ_G-0hYaL4AH16CVS_xDel8UXrJVK6x7JDaGA,61
|
|
114
|
+
elasticsearch-8.13.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
115
|
+
elasticsearch-8.13.2.dist-info/top_level.txt,sha256=Jp2bLWq49skvCN4YCZsg1Hfn_NDLgleC-x-Bn01_HgM,14
|
|
116
|
+
elasticsearch-8.13.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|