elasticsearch 8.14.0__py3-none-any.whl → 8.15.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
- elasticsearch/_async/client/__init__.py +200 -168
- elasticsearch/_async/client/async_search.py +35 -20
- elasticsearch/_async/client/autoscaling.py +4 -4
- elasticsearch/_async/client/cat.py +785 -180
- elasticsearch/_async/client/ccr.py +20 -32
- elasticsearch/_async/client/cluster.py +87 -79
- elasticsearch/_async/client/connector.py +1470 -0
- elasticsearch/_async/client/dangling_indices.py +7 -11
- elasticsearch/_async/client/enrich.py +8 -8
- elasticsearch/_async/client/eql.py +17 -16
- elasticsearch/_async/client/esql.py +2 -2
- elasticsearch/_async/client/features.py +2 -2
- elasticsearch/_async/client/fleet.py +18 -17
- elasticsearch/_async/client/graph.py +4 -4
- elasticsearch/_async/client/ilm.py +36 -44
- elasticsearch/_async/client/indices.py +295 -317
- elasticsearch/_async/client/inference.py +42 -33
- elasticsearch/_async/client/ingest.py +197 -23
- elasticsearch/_async/client/license.py +18 -10
- elasticsearch/_async/client/logstash.py +6 -6
- elasticsearch/_async/client/migration.py +3 -3
- elasticsearch/_async/client/ml.py +383 -176
- elasticsearch/_async/client/monitoring.py +2 -2
- elasticsearch/_async/client/nodes.py +32 -32
- elasticsearch/_async/client/query_rules.py +384 -0
- elasticsearch/_async/client/rollup.py +13 -13
- elasticsearch/_async/client/search_application.py +10 -10
- elasticsearch/_async/client/searchable_snapshots.py +9 -13
- elasticsearch/_async/client/security.py +577 -104
- elasticsearch/_async/client/shutdown.py +7 -7
- elasticsearch/_async/client/slm.py +11 -13
- elasticsearch/_async/client/snapshot.py +39 -52
- elasticsearch/_async/client/sql.py +12 -14
- elasticsearch/_async/client/ssl.py +1 -1
- elasticsearch/_async/client/synonyms.py +11 -9
- elasticsearch/_async/client/tasks.py +9 -10
- elasticsearch/_async/client/text_structure.py +3 -3
- elasticsearch/_async/client/transform.py +89 -34
- elasticsearch/_async/client/watcher.py +30 -15
- elasticsearch/_async/client/xpack.py +6 -7
- elasticsearch/_otel.py +24 -6
- elasticsearch/_sync/client/__init__.py +200 -168
- elasticsearch/_sync/client/async_search.py +35 -20
- elasticsearch/_sync/client/autoscaling.py +4 -4
- elasticsearch/_sync/client/cat.py +785 -180
- elasticsearch/_sync/client/ccr.py +20 -32
- elasticsearch/_sync/client/cluster.py +87 -79
- elasticsearch/_sync/client/connector.py +1470 -0
- elasticsearch/_sync/client/dangling_indices.py +7 -11
- elasticsearch/_sync/client/enrich.py +8 -8
- elasticsearch/_sync/client/eql.py +17 -16
- elasticsearch/_sync/client/esql.py +2 -2
- elasticsearch/_sync/client/features.py +2 -2
- elasticsearch/_sync/client/fleet.py +18 -17
- elasticsearch/_sync/client/graph.py +4 -4
- elasticsearch/_sync/client/ilm.py +36 -44
- elasticsearch/_sync/client/indices.py +295 -317
- elasticsearch/_sync/client/inference.py +42 -33
- elasticsearch/_sync/client/ingest.py +197 -23
- elasticsearch/_sync/client/license.py +18 -10
- elasticsearch/_sync/client/logstash.py +6 -6
- elasticsearch/_sync/client/migration.py +3 -3
- elasticsearch/_sync/client/ml.py +383 -176
- elasticsearch/_sync/client/monitoring.py +2 -2
- elasticsearch/_sync/client/nodes.py +32 -32
- elasticsearch/_sync/client/query_rules.py +384 -0
- elasticsearch/_sync/client/rollup.py +13 -13
- elasticsearch/_sync/client/search_application.py +10 -10
- elasticsearch/_sync/client/searchable_snapshots.py +9 -13
- elasticsearch/_sync/client/security.py +577 -104
- elasticsearch/_sync/client/shutdown.py +7 -7
- elasticsearch/_sync/client/slm.py +11 -13
- elasticsearch/_sync/client/snapshot.py +39 -52
- elasticsearch/_sync/client/sql.py +12 -14
- elasticsearch/_sync/client/ssl.py +1 -1
- elasticsearch/_sync/client/synonyms.py +11 -9
- elasticsearch/_sync/client/tasks.py +9 -10
- elasticsearch/_sync/client/text_structure.py +3 -3
- elasticsearch/_sync/client/transform.py +89 -34
- elasticsearch/_sync/client/watcher.py +30 -15
- elasticsearch/_sync/client/xpack.py +6 -7
- elasticsearch/_version.py +1 -1
- elasticsearch/client.py +3 -3
- elasticsearch/helpers/actions.py +120 -106
- elasticsearch/helpers/vectorstore/_async/vectorstore.py +36 -6
- elasticsearch/helpers/vectorstore/_sync/vectorstore.py +36 -6
- elasticsearch/serializer.py +34 -0
- elasticsearch-8.15.1.dist-info/METADATA +177 -0
- elasticsearch-8.15.1.dist-info/RECORD +117 -0
- {elasticsearch-8.14.0.dist-info → elasticsearch-8.15.1.dist-info}/WHEEL +1 -2
- elasticsearch/_async/client/query_ruleset.py +0 -205
- elasticsearch/_sync/client/query_ruleset.py +0 -205
- elasticsearch-8.14.0.dist-info/METADATA +0 -161
- elasticsearch-8.14.0.dist-info/RECORD +0 -116
- elasticsearch-8.14.0.dist-info/top_level.txt +0 -1
- {elasticsearch-8.14.0.dist-info → elasticsearch-8.15.1.dist-info/licenses}/LICENSE +0 -0
- {elasticsearch-8.14.0.dist-info → elasticsearch-8.15.1.dist-info/licenses}/NOTICE +0 -0
elasticsearch/helpers/actions.py
CHANGED

@@ -34,6 +34,8 @@ from typing import (
     Union,
 )
 
+from elastic_transport import OpenTelemetrySpan
+
 from .. import Elasticsearch
 from ..compat import to_bytes
 from ..exceptions import ApiError, NotFoundError, TransportError

@@ -322,6 +324,7 @@ def _process_bulk_chunk(
             Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
         ]
     ],
+    otel_span: OpenTelemetrySpan,
     raise_on_exception: bool = True,
     raise_on_error: bool = True,
     ignore_status: Union[int, Collection[int]] = (),

@@ -331,28 +334,29 @@ def _process_bulk_chunk(
     """
     Send a bulk request to elasticsearch and process the output.
     """
-    if isinstance(ignore_status, int):
-        ignore_status = (ignore_status,)
-
-    try:
-        # send the actual request
-        resp = client.bulk(*args, operations=bulk_actions, **kwargs)  # type: ignore[arg-type]
-    except ApiError as e:
-        gen = _process_bulk_chunk_error(
-            error=e,
-            bulk_data=bulk_data,
-            ignore_status=ignore_status,
-            raise_on_exception=raise_on_exception,
-            raise_on_error=raise_on_error,
-        )
-    else:
-        gen = _process_bulk_chunk_success(
-            resp=resp.body,
-            bulk_data=bulk_data,
-            ignore_status=ignore_status,
-            raise_on_error=raise_on_error,
-        )
-    yield from gen
+    with client._otel.use_span(otel_span):
+        if isinstance(ignore_status, int):
+            ignore_status = (ignore_status,)
+
+        try:
+            # send the actual request
+            resp = client.bulk(*args, operations=bulk_actions, **kwargs)  # type: ignore[arg-type]
+        except ApiError as e:
+            gen = _process_bulk_chunk_error(
+                error=e,
+                bulk_data=bulk_data,
+                ignore_status=ignore_status,
+                raise_on_exception=raise_on_exception,
+                raise_on_error=raise_on_error,
+            )
+        else:
+            gen = _process_bulk_chunk_success(
+                resp=resp.body,
+                bulk_data=bulk_data,
+                ignore_status=ignore_status,
+                raise_on_error=raise_on_error,
+            )
+        yield from gen
 
 
 def streaming_bulk(

@@ -370,6 +374,7 @@ def streaming_bulk(
     max_backoff: float = 600,
     yield_ok: bool = True,
     ignore_status: Union[int, Collection[int]] = (),
+    span_name: str = "helpers.streaming_bulk",
     *args: Any,
     **kwargs: Any,
 ) -> Iterable[Tuple[bool, Dict[str, Any]]]:

@@ -406,73 +411,78 @@ def streaming_bulk(
     :arg yield_ok: if set to False will skip successful documents in the output
     :arg ignore_status: list of HTTP status code that you want to ignore
     """
-    client = client.options()
-    client._client_meta = (("h", "bp"),)
+    with client._otel.helpers_span(span_name) as otel_span:
+        client = client.options()
+        client._client_meta = (("h", "bp"),)
 
-    serializer = client.transport.serializers.get_serializer("application/json")
+        serializer = client.transport.serializers.get_serializer("application/json")
 
-    bulk_data: List[
-        Union[
-            Tuple[_TYPE_BULK_ACTION_HEADER],
-            Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
+        bulk_data: List[
+            Union[
+                Tuple[_TYPE_BULK_ACTION_HEADER],
+                Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
+            ]
         ]
-    ]
-    bulk_actions: List[bytes]
-    for bulk_data, bulk_actions in _chunk_actions(
-        map(expand_action_callback, actions), chunk_size, max_chunk_bytes, serializer
-    ):
-        for attempt in range(max_retries + 1):
-            to_retry: List[bytes] = []
-            to_retry_data: List[
-                Union[
-                    Tuple[_TYPE_BULK_ACTION_HEADER],
-                    Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
-                ]
-            ] = []
-            if attempt:
-                time.sleep(min(max_backoff, initial_backoff * 2 ** (attempt - 1)))
-
-            try:
-                for data, (ok, info) in zip(
-                    bulk_data,
-                    _process_bulk_chunk(
-                        client,
-                        bulk_actions,
+        bulk_actions: List[bytes]
+        for bulk_data, bulk_actions in _chunk_actions(
+            map(expand_action_callback, actions),
+            chunk_size,
+            max_chunk_bytes,
+            serializer,
+        ):
+            for attempt in range(max_retries + 1):
+                to_retry: List[bytes] = []
+                to_retry_data: List[
+                    Union[
+                        Tuple[_TYPE_BULK_ACTION_HEADER],
+                        Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
+                    ]
+                ] = []
+                if attempt:
+                    time.sleep(min(max_backoff, initial_backoff * 2 ** (attempt - 1)))
+
+                try:
+                    for data, (ok, info) in zip(
                         bulk_data,
-                        raise_on_exception,
-                        raise_on_error,
-                        ignore_status,
-                        *args,
-                        **kwargs,
-                    ),
-                ):
-                    if not ok:
-                        action, info = info.popitem()
-                        # retry if retries enabled, we get 429, and we are not
-                        # in the last attempt
-                        if (
-                            max_retries
-                            and info["status"] == 429
-                            and (attempt + 1) <= max_retries
-                        ):
-                            # _process_bulk_chunk expects bytes so we need to
-                            # re-serialize the data
-                            to_retry.extend(map(serializer.dumps, data))
-                            to_retry_data.append(data)
-                        else:
-                            yield ok, {action: info}
-                    elif yield_ok:
-                        yield ok, info
-
-            except ApiError as e:
-                # suppress 429 errors since we will retry them
-                if attempt == max_retries or e.status_code != 429:
-                    raise
-            else:
-                if not to_retry:
-                    break
-                # retry only subset of documents that didn't succeed
-                bulk_actions, bulk_data = to_retry, to_retry_data
+                        _process_bulk_chunk(
+                            client,
+                            bulk_actions,
+                            bulk_data,
+                            otel_span,
+                            raise_on_exception,
+                            raise_on_error,
+                            ignore_status,
+                            *args,
+                            **kwargs,
+                        ),
+                    ):
+                        if not ok:
+                            action, info = info.popitem()
+                            # retry if retries enabled, we get 429, and we are not
+                            # in the last attempt
+                            if (
+                                max_retries
+                                and info["status"] == 429
+                                and (attempt + 1) <= max_retries
+                            ):
+                                # _process_bulk_chunk expects bytes so we need to
+                                # re-serialize the data
+                                to_retry.extend(map(serializer.dumps, data))
+                                to_retry_data.append(data)
+                            else:
+                                yield ok, {action: info}
+                        elif yield_ok:
+                            yield ok, info
+
+                except ApiError as e:
+                    # suppress 429 errors since we will retry them
+                    if attempt == max_retries or e.status_code != 429:
+                        raise
+                else:
+                    if not to_retry:
+                        break
+                    # retry only subset of documents that didn't succeed
+                    bulk_actions, bulk_data = to_retry, to_retry_data
 
 
 def bulk(
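
The retry loop above sleeps `min(max_backoff, initial_backoff * 2 ** (attempt - 1))` seconds before each retry. A quick worked example of that schedule; `max_backoff=600` is the default shown in the signature above, while `initial_backoff=2` is assumed here (the full signature is not part of this hunk):

```python
initial_backoff, max_backoff = 2.0, 600.0  # assumed default / default shown above
for attempt in range(1, 6):
    # sleep applied before retry attempt `attempt`
    print(attempt, min(max_backoff, initial_backoff * 2 ** (attempt - 1)))
# 1 -> 2.0s, 2 -> 4.0s, 3 -> 8.0s, 4 -> 16.0s, 5 -> 32.0s
```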

@@ -519,7 +529,7 @@ def bulk(
     # make streaming_bulk yield successful results so we can count them
     kwargs["yield_ok"] = True
     for ok, item in streaming_bulk(
-        client, actions, ignore_status=ignore_status, *args, **kwargs  # type: ignore[misc]
+        client, actions, ignore_status=ignore_status, span_name="helpers.bulk", *args, **kwargs  # type: ignore[misc]
    ):
         # go through request-response pairs and detect failures
         if not ok:
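
For callers, `bulk()` behaves as before; the only change is that its internal `streaming_bulk` call is now traced under the span name "helpers.bulk". A minimal sketch (endpoint, index, and documents are illustrative):

```python
from elasticsearch import Elasticsearch, helpers

client = Elasticsearch("http://localhost:9200")  # illustrative endpoint
docs = [{"_index": "my-index", "_source": {"n": i}} for i in range(10)]

# Returns (number of successes, list of errors); 409 responses are ignored here.
success, errors = helpers.bulk(client, docs, ignore_status=(409,))
```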

@@ -589,27 +599,31 @@ def parallel_bulk(
             ] = Queue(max(queue_size, thread_count))
             self._quick_put = self._inqueue.put
 
-    pool = BlockingPool(thread_count)
+    with client._otel.helpers_span("helpers.parallel_bulk") as otel_span:
+        pool = BlockingPool(thread_count)
 
-    try:
-        for result in pool.imap(
-            lambda bulk_chunk: list(
-                _process_bulk_chunk(
-                    client,
-                    bulk_chunk[1],
-                    bulk_chunk[0],
-                    ignore_status=ignore_status,  # type: ignore[misc]
-                    *args,
-                    **kwargs,
-                )
-            ),
-            _chunk_actions(expanded_actions, chunk_size, max_chunk_bytes, serializer),
-        ):
-            yield from result
-
-    finally:
-        pool.close()
-        pool.join()
+        try:
+            for result in pool.imap(
+                lambda bulk_chunk: list(
+                    _process_bulk_chunk(
+                        client,
+                        bulk_chunk[1],
+                        bulk_chunk[0],
+                        otel_span=otel_span,
+                        ignore_status=ignore_status,  # type: ignore[misc]
+                        *args,
+                        **kwargs,
+                    )
+                ),
+                _chunk_actions(
+                    expanded_actions, chunk_size, max_chunk_bytes, serializer
+                ),
+            ):
+                yield from result
+
+        finally:
+            pool.close()
+            pool.join()
 
 
 def scan(
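
Taken together, the `actions.py` changes wrap each bulk helper in one OpenTelemetry parent span, with `span_name` as the single new public knob on `streaming_bulk`. A hedged sketch of how this surfaces to callers, assuming an OpenTelemetry SDK is already configured (endpoint and data are illustrative):

```python
from elasticsearch import Elasticsearch, helpers

client = Elasticsearch("http://localhost:9200")  # illustrative endpoint
docs = ({"_index": "my-index", "_source": {"n": i}} for i in range(1000))

# One parent span (default name "helpers.streaming_bulk", overridden here)
# covers the whole call; each chunked _bulk request runs inside it.
for ok, item in helpers.streaming_bulk(client, docs, span_name="my-app.ingest"):
    if not ok:
        print("failed:", item)
```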

elasticsearch/helpers/vectorstore/_async/vectorstore.py
CHANGED

@@ -60,6 +60,7 @@ class AsyncVectorStore:
         vector_field: str = "vector_field",
         metadata_mappings: Optional[Dict[str, Any]] = None,
         user_agent: str = f"elasticsearch-py-vs/{lib_version}",
+        custom_index_settings: Optional[Dict[str, Any]] = None,
     ) -> None:
         """
         :param user_header: user agent header specific to the 3rd party integration.

@@ -72,6 +73,11 @@ class AsyncVectorStore:
             the embedding vector goes in this field.
         :param client: Elasticsearch client connection. Alternatively specify the
             Elasticsearch connection with the other es_* parameters.
+        :param custom_index_settings: A dictionary of custom settings for the index.
+            This can include configurations like the number of shards, number of replicas,
+            analysis settings, and other index-specific settings. If not provided, default
+            settings will be used. Note that if the same setting is provided by both the user
+            and the strategy, will raise an error.
         """
         # Add integration-specific usage header for tracking usage in Elastic Cloud.
         # client.options preserves existing (non-user-agent) headers.

@@ -90,6 +96,7 @@ class AsyncVectorStore:
         self.text_field = text_field
         self.vector_field = vector_field
         self.metadata_mappings = metadata_mappings
+        self.custom_index_settings = custom_index_settings
 
     async def close(self) -> None:
         return await self.client.close()

@@ -225,7 +232,7 @@ class AsyncVectorStore:
     async def search(
         self,
         *,
-        query: Optional[str],
+        query: Optional[str] = None,
         query_vector: Optional[List[float]] = None,
         k: int = 4,
         num_candidates: int = 50,

@@ -306,6 +313,16 @@ class AsyncVectorStore:
                 vector_field=self.vector_field,
                 num_dimensions=self.num_dimensions,
             )
+
+            if self.custom_index_settings:
+                conflicting_keys = set(self.custom_index_settings.keys()) & set(
+                    settings.keys()
+                )
+                if conflicting_keys:
+                    raise ValueError(f"Conflicting settings: {conflicting_keys}")
+                else:
+                    settings.update(self.custom_index_settings)
+
             if self.metadata_mappings:
                 metadata = mappings["properties"].get("metadata", {"properties": {}})
                 for key in self.metadata_mappings.keys():
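
A hedged usage sketch of the new `custom_index_settings` parameter (the strategy and class names are assumed from the vectorstore helper's public exports; the endpoint, index name, and settings values are illustrative). User settings are merged into what the retrieval strategy generates, and any colliding key raises the `ValueError` shown above:

```python
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers.vectorstore import AsyncDenseVectorStrategy, AsyncVectorStore

store = AsyncVectorStore(
    AsyncElasticsearch("http://localhost:9200"),  # illustrative endpoint
    index="my-vectors",
    retrieval_strategy=AsyncDenseVectorStrategy(),
    # Merged into the strategy-generated settings at index creation; a key
    # the strategy also sets raises ValueError("Conflicting settings: ...").
    custom_index_settings={"number_of_shards": 2, "number_of_replicas": 1},
)
```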

@@ -327,8 +344,9 @@ class AsyncVectorStore:
     async def max_marginal_relevance_search(
         self,
         *,
-        query: str,
-        embedding_service: AsyncEmbeddingService,
+        query: Optional[str] = None,
+        query_embedding: Optional[List[float]] = None,
+        embedding_service: Optional[AsyncEmbeddingService] = None,
         vector_field: str,
         k: int = 4,
         num_candidates: int = 20,

@@ -344,6 +362,8 @@ class AsyncVectorStore:
             among selected documents.
 
         :param query (str): Text to look up documents similar to.
+        :param query_embedding: Input embedding vector. If given, input query string is
+            ignored.
         :param k (int): Number of Documents to return. Defaults to 4.
         :param fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
         :param lambda_mult (float): Number between 0 and 1 that determines the degree

@@ -364,12 +384,22 @@ class AsyncVectorStore:
         remove_vector_query_field_from_metadata = False
 
         # Embed the query
-        query_embedding = await embedding_service.embed_query(query)
+        if query_embedding:
+            query_vector = query_embedding
+        else:
+            if not query:
+                raise ValueError("specify either query or query_embedding to search")
+            elif embedding_service:
+                query_vector = await embedding_service.embed_query(query)
+            elif self.embedding_service:
+                query_vector = await self.embedding_service.embed_query(query)
+            else:
+                raise ValueError("specify embedding_service to search with query")
 
         # Fetch the initial documents
         got_hits = await self.search(
             query=None,
-            query_vector=query_embedding,
+            query_vector=query_vector,
             k=num_candidates,
             fields=fields,
             custom_query=custom_query,

@@ -380,7 +410,7 @@ class AsyncVectorStore:
 
         # Select documents using maximal marginal relevance
         selected_indices = maximal_marginal_relevance(
-            query_embedding, got_embeddings, lambda_mult=lambda_mult, k=k
+            query_vector, got_embeddings, lambda_mult=lambda_mult, k=k
         )
         selected_hits = [got_hits[i] for i in selected_indices]
 
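
With these changes, `max_marginal_relevance_search` accepts a precomputed vector, in which case no embedding service is needed. A brief sketch, inside an async function and continuing the `store` object from the earlier example (vector values illustrative; dimensionality must match the index):

```python
async def mmr_example():
    # The query string is ignored when query_embedding is given.
    return await store.max_marginal_relevance_search(
        query_embedding=[0.1, 0.2, 0.3],  # illustrative precomputed vector
        vector_field="vector_field",
        k=4,
        num_candidates=20,
    )
```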

elasticsearch/helpers/vectorstore/_sync/vectorstore.py
CHANGED

@@ -57,6 +57,7 @@ class VectorStore:
         vector_field: str = "vector_field",
         metadata_mappings: Optional[Dict[str, Any]] = None,
         user_agent: str = f"elasticsearch-py-vs/{lib_version}",
+        custom_index_settings: Optional[Dict[str, Any]] = None,
     ) -> None:
         """
         :param user_header: user agent header specific to the 3rd party integration.

@@ -69,6 +70,11 @@ class VectorStore:
             the embedding vector goes in this field.
         :param client: Elasticsearch client connection. Alternatively specify the
             Elasticsearch connection with the other es_* parameters.
+        :param custom_index_settings: A dictionary of custom settings for the index.
+            This can include configurations like the number of shards, number of replicas,
+            analysis settings, and other index-specific settings. If not provided, default
+            settings will be used. Note that if the same setting is provided by both the user
+            and the strategy, will raise an error.
         """
         # Add integration-specific usage header for tracking usage in Elastic Cloud.
         # client.options preserves existing (non-user-agent) headers.

@@ -87,6 +93,7 @@ class VectorStore:
         self.text_field = text_field
         self.vector_field = vector_field
         self.metadata_mappings = metadata_mappings
+        self.custom_index_settings = custom_index_settings
 
     def close(self) -> None:
         return self.client.close()

@@ -222,7 +229,7 @@ class VectorStore:
     def search(
         self,
         *,
-        query: Optional[str],
+        query: Optional[str] = None,
         query_vector: Optional[List[float]] = None,
         k: int = 4,
         num_candidates: int = 50,

@@ -303,6 +310,16 @@ class VectorStore:
                 vector_field=self.vector_field,
                 num_dimensions=self.num_dimensions,
             )
+
+            if self.custom_index_settings:
+                conflicting_keys = set(self.custom_index_settings.keys()) & set(
+                    settings.keys()
+                )
+                if conflicting_keys:
+                    raise ValueError(f"Conflicting settings: {conflicting_keys}")
+                else:
+                    settings.update(self.custom_index_settings)
+
             if self.metadata_mappings:
                 metadata = mappings["properties"].get("metadata", {"properties": {}})
                 for key in self.metadata_mappings.keys():

@@ -324,8 +341,9 @@ class VectorStore:
     def max_marginal_relevance_search(
         self,
         *,
-        query: str,
-        embedding_service: EmbeddingService,
+        query: Optional[str] = None,
+        query_embedding: Optional[List[float]] = None,
+        embedding_service: Optional[EmbeddingService] = None,
         vector_field: str,
         k: int = 4,
         num_candidates: int = 20,

@@ -341,6 +359,8 @@ class VectorStore:
             among selected documents.
 
         :param query (str): Text to look up documents similar to.
+        :param query_embedding: Input embedding vector. If given, input query string is
+            ignored.
         :param k (int): Number of Documents to return. Defaults to 4.
         :param fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
         :param lambda_mult (float): Number between 0 and 1 that determines the degree

@@ -361,12 +381,22 @@ class VectorStore:
         remove_vector_query_field_from_metadata = False
 
         # Embed the query
-        query_embedding = embedding_service.embed_query(query)
+        if query_embedding:
+            query_vector = query_embedding
+        else:
+            if not query:
+                raise ValueError("specify either query or query_embedding to search")
+            elif embedding_service:
+                query_vector = embedding_service.embed_query(query)
+            elif self.embedding_service:
+                query_vector = self.embedding_service.embed_query(query)
+            else:
+                raise ValueError("specify embedding_service to search with query")
 
         # Fetch the initial documents
         got_hits = self.search(
             query=None,
-            query_vector=query_embedding,
+            query_vector=query_vector,
             k=num_candidates,
             fields=fields,
             custom_query=custom_query,

@@ -377,7 +407,7 @@ class VectorStore:
 
         # Select documents using maximal marginal relevance
         selected_indices = maximal_marginal_relevance(
-            query_embedding, got_embeddings, lambda_mult=lambda_mult, k=k
+            query_vector, got_embeddings, lambda_mult=lambda_mult, k=k
        )
         selected_hits = [got_hits[i] for i in selected_indices]
 
elasticsearch/serializer.py
CHANGED

@@ -49,6 +49,14 @@ except ImportError:
     _OrjsonSerializer = None  # type: ignore[assignment,misc]
 
 
+try:
+    import pyarrow as pa
+
+    __all__.append("PyArrowSerializer")
+except ImportError:
+    pa = None
+
+
 class JsonSerializer(_JsonSerializer):
     mimetype: ClassVar[str] = "application/json"
 

@@ -114,6 +122,29 @@ class MapboxVectorTileSerializer(Serializer):
         raise SerializationError(f"Cannot serialize {data!r} into a MapBox vector tile")
 
 
+if pa is not None:
+
+    class PyArrowSerializer(Serializer):
+        """PyArrow serializer for deserializing Arrow Stream data."""
+
+        mimetype: ClassVar[str] = "application/vnd.apache.arrow.stream"
+
+        def loads(self, data: bytes) -> pa.Table:
+            try:
+                with pa.ipc.open_stream(data) as reader:
+                    return reader.read_all()
+            except pa.ArrowException as e:
+                raise SerializationError(
+                    message=f"Unable to deserialize as Arrow stream: {data!r}",
+                    errors=(e,),
+                )
+
+        def dumps(self, data: Any) -> bytes:
+            raise SerializationError(
+                message="Elasticsearch does not accept Arrow input data"
+            )
+
+
 DEFAULT_SERIALIZERS: Dict[str, Serializer] = {
     JsonSerializer.mimetype: JsonSerializer(),
     MapboxVectorTileSerializer.mimetype: MapboxVectorTileSerializer(),

@@ -122,6 +153,9 @@ DEFAULT_SERIALIZERS: Dict[str, Serializer] = {
     CompatibilityModeNdjsonSerializer.mimetype: CompatibilityModeNdjsonSerializer(),
 }
 
+if pa is not None:
+    DEFAULT_SERIALIZERS[PyArrowSerializer.mimetype] = PyArrowSerializer()
+
 # Alias for backwards compatibility
 JSONSerializer = JsonSerializer
 
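
A small self-contained check of the new serializer (requires `pyarrow`; table contents are illustrative). It builds an Arrow IPC stream in memory, standing in for a response body with mimetype `application/vnd.apache.arrow.stream`, and deserializes it with `loads()`; `dumps()` deliberately raises, since Elasticsearch does not accept Arrow input:

```python
import pyarrow as pa

from elasticsearch.serializer import PyArrowSerializer

# Build an Arrow IPC stream in memory as a stand-in for a response body.
table = pa.table({"name": ["a", "b"], "value": [1, 2]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)

deserialized = PyArrowSerializer().loads(sink.getvalue().to_pybytes())
assert deserialized.equals(table)
```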