elasticsearch-haystack 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/PKG-INFO +2 -2
- elasticsearch_haystack-0.4.0/pydoc/config.yml +32 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/pyproject.toml +11 -7
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +32 -14
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +53 -11
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +119 -55
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/test_document_store.py +32 -27
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/.gitignore +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/LICENSE +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/README.md +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/test_bm25_retriever.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/test_embedding_retriever.py +0 -0
- {elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/test_filters.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
loaders:
|
|
2
|
+
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
|
3
|
+
search_path: [../src]
|
|
4
|
+
modules: [
|
|
5
|
+
"haystack_integrations.components.retrievers.elasticsearch.bm25_retriever",
|
|
6
|
+
"haystack_integrations.components.retrievers.elasticsearch.embedding_retriever",
|
|
7
|
+
"haystack_integrations.document_stores.elasticsearch.document_store",
|
|
8
|
+
"haystack_integrations.document_stores.elasticsearch.filters",
|
|
9
|
+
]
|
|
10
|
+
ignore_when_discovered: ["__init__"]
|
|
11
|
+
processors:
|
|
12
|
+
- type: filter
|
|
13
|
+
expression:
|
|
14
|
+
documented_only: true
|
|
15
|
+
do_not_filter_modules: false
|
|
16
|
+
skip_empty_modules: true
|
|
17
|
+
- type: smart
|
|
18
|
+
- type: crossref
|
|
19
|
+
renderer:
|
|
20
|
+
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
|
|
21
|
+
excerpt: Elasticsearch integration for Haystack
|
|
22
|
+
category_slug: integrations-api
|
|
23
|
+
title: Elasticsearch
|
|
24
|
+
slug: integrations-elasticsearch
|
|
25
|
+
order: 70
|
|
26
|
+
markdown:
|
|
27
|
+
descriptive_class_title: false
|
|
28
|
+
classdef_code_block: false
|
|
29
|
+
descriptive_module_title: true
|
|
30
|
+
add_method_class_prefix: true
|
|
31
|
+
add_member_class_prefix: false
|
|
32
|
+
filename: _readme_elasticsearch.md
|
|
@@ -49,6 +49,7 @@ dependencies = [
|
|
|
49
49
|
"coverage[toml]>=6.5",
|
|
50
50
|
"pytest",
|
|
51
51
|
"pytest-xdist",
|
|
52
|
+
"haystack-pydoc-tools",
|
|
52
53
|
]
|
|
53
54
|
[tool.hatch.envs.default.scripts]
|
|
54
55
|
test = "pytest {args:tests}"
|
|
@@ -61,6 +62,9 @@ cov = [
|
|
|
61
62
|
"test-cov",
|
|
62
63
|
"cov-report",
|
|
63
64
|
]
|
|
65
|
+
docs = [
|
|
66
|
+
"pydoc-markdown pydoc/config.yml"
|
|
67
|
+
]
|
|
64
68
|
|
|
65
69
|
[[tool.hatch.envs.all.matrix]]
|
|
66
70
|
python = ["3.8", "3.9", "3.10", "3.11"]
|
|
@@ -92,12 +96,12 @@ all = [
|
|
|
92
96
|
allow-direct-references = true
|
|
93
97
|
|
|
94
98
|
[tool.black]
|
|
95
|
-
target-version = ["
|
|
99
|
+
target-version = ["py38"]
|
|
96
100
|
line-length = 120
|
|
97
101
|
skip-string-normalization = true
|
|
98
102
|
|
|
99
103
|
[tool.ruff]
|
|
100
|
-
target-version = "
|
|
104
|
+
target-version = "py38"
|
|
101
105
|
line-length = 120
|
|
102
106
|
select = [
|
|
103
107
|
"A",
|
|
@@ -152,21 +156,21 @@ ban-relative-imports = "parents"
|
|
|
152
156
|
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
153
157
|
|
|
154
158
|
[tool.coverage.run]
|
|
155
|
-
|
|
159
|
+
source = ["haystack_integrations"]
|
|
156
160
|
branch = true
|
|
157
|
-
parallel =
|
|
161
|
+
parallel = false
|
|
158
162
|
|
|
159
|
-
[tool.coverage.paths]
|
|
160
|
-
elasticsearch_haystack = ["src/haystack_integrations", "*/elasticsearch/src/haystack_integrations"]
|
|
161
|
-
tests = ["tests", "*/elasticsearch/src/tests"]
|
|
162
163
|
|
|
163
164
|
[tool.coverage.report]
|
|
165
|
+
omit = ["*/tests/*", "*/__init__.py"]
|
|
166
|
+
show_missing=true
|
|
164
167
|
exclude_lines = [
|
|
165
168
|
"no cov",
|
|
166
169
|
"if __name__ == .__main__.:",
|
|
167
170
|
"if TYPE_CHECKING:",
|
|
168
171
|
]
|
|
169
172
|
|
|
173
|
+
|
|
170
174
|
[tool.pytest.ini_options]
|
|
171
175
|
minversion = "6.0"
|
|
172
176
|
markers = [
|
|
@@ -11,8 +11,9 @@ from haystack_integrations.document_stores.elasticsearch.document_store import E
|
|
|
11
11
|
@component
|
|
12
12
|
class ElasticsearchBM25Retriever:
|
|
13
13
|
"""
|
|
14
|
-
ElasticsearchBM25Retriever
|
|
15
|
-
similar documents to a user's query.
|
|
14
|
+
ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using BM25 algorithm to find the
|
|
15
|
+
most similar documents to a user's query.
|
|
16
|
+
|
|
16
17
|
This retriever is only compatible with ElasticsearchDocumentStore.
|
|
17
18
|
|
|
18
19
|
Usage example:
|
|
@@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever:
|
|
|
35
36
|
|
|
36
37
|
result = retriever.run(query="Who lives in Berlin?")
|
|
37
38
|
for doc in result["documents"]:
|
|
38
|
-
print(doc.
|
|
39
|
+
print(doc.content)
|
|
39
40
|
```
|
|
40
41
|
"""
|
|
41
42
|
|
|
@@ -53,12 +54,13 @@ class ElasticsearchBM25Retriever:
|
|
|
53
54
|
|
|
54
55
|
:param document_store: An instance of ElasticsearchDocumentStore.
|
|
55
56
|
:param filters: Filters applied to the retrieved Documents, for more info
|
|
56
|
-
see `ElasticsearchDocumentStore.filter_documents
|
|
57
|
-
:param fuzziness: Fuzziness parameter passed to Elasticsearch
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
:param top_k: Maximum number of Documents to return
|
|
61
|
-
:param scale_score: If `True` scales the Document`s scores between 0 and 1
|
|
57
|
+
see `ElasticsearchDocumentStore.filter_documents`.
|
|
58
|
+
:param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
|
|
59
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
|
|
60
|
+
for more details.
|
|
61
|
+
:param top_k: Maximum number of Documents to return.
|
|
62
|
+
:param scale_score: If `True` scales the Document`s scores between 0 and 1.
|
|
63
|
+
:raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`.
|
|
62
64
|
"""
|
|
63
65
|
|
|
64
66
|
if not isinstance(document_store, ElasticsearchDocumentStore):
|
|
@@ -72,6 +74,12 @@ class ElasticsearchBM25Retriever:
|
|
|
72
74
|
self._scale_score = scale_score
|
|
73
75
|
|
|
74
76
|
def to_dict(self) -> Dict[str, Any]:
|
|
77
|
+
"""
|
|
78
|
+
Serializes the component to a dictionary.
|
|
79
|
+
|
|
80
|
+
:returns:
|
|
81
|
+
Dictionary with serialized data.
|
|
82
|
+
"""
|
|
75
83
|
return default_to_dict(
|
|
76
84
|
self,
|
|
77
85
|
filters=self._filters,
|
|
@@ -83,23 +91,33 @@ class ElasticsearchBM25Retriever:
|
|
|
83
91
|
|
|
84
92
|
@classmethod
|
|
85
93
|
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
|
|
94
|
+
"""
|
|
95
|
+
Deserializes the component from a dictionary.
|
|
96
|
+
|
|
97
|
+
:param data:
|
|
98
|
+
Dictionary to deserialize from.
|
|
99
|
+
:returns:
|
|
100
|
+
Deserialized component.
|
|
101
|
+
"""
|
|
86
102
|
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
|
|
87
103
|
data["init_parameters"]["document_store"]
|
|
88
104
|
)
|
|
89
105
|
return default_from_dict(cls, data)
|
|
90
106
|
|
|
91
107
|
@component.output_types(documents=List[Document])
|
|
92
|
-
def run(self, query: str, top_k: Optional[int] = None):
|
|
108
|
+
def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
|
|
93
109
|
"""
|
|
94
110
|
Retrieve documents using the BM25 keyword-based algorithm.
|
|
95
111
|
|
|
96
|
-
:param query: String to search in
|
|
97
|
-
:param
|
|
98
|
-
:
|
|
112
|
+
:param query: String to search in `Document`s' text.
|
|
113
|
+
:param filters: Filters applied to the retrieved `Document`s.
|
|
114
|
+
:param top_k: Maximum number of `Document` to return.
|
|
115
|
+
:returns: A dictionary with the following keys:
|
|
116
|
+
- `documents`: List of `Document`s that match the query.
|
|
99
117
|
"""
|
|
100
118
|
docs = self._document_store._bm25_retrieval(
|
|
101
119
|
query=query,
|
|
102
|
-
filters=self._filters,
|
|
120
|
+
filters=filters or self._filters,
|
|
103
121
|
fuzziness=self._fuzziness,
|
|
104
122
|
top_k=top_k or self._top_k,
|
|
105
123
|
scale_score=self._scale_score,
|
|
@@ -11,9 +11,35 @@ from haystack_integrations.document_stores.elasticsearch.document_store import E
|
|
|
11
11
|
@component
|
|
12
12
|
class ElasticsearchEmbeddingRetriever:
|
|
13
13
|
"""
|
|
14
|
-
|
|
14
|
+
ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity.
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
Usage example:
|
|
17
|
+
```python
|
|
18
|
+
from haystack import Document
|
|
19
|
+
from haystack.components.embedders import SentenceTransformersTextEmbedder
|
|
20
|
+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
21
|
+
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
|
|
22
|
+
|
|
23
|
+
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
|
|
24
|
+
retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
|
|
25
|
+
|
|
26
|
+
# Add documents to DocumentStore
|
|
27
|
+
documents = [
|
|
28
|
+
Document(text="My name is Carla and I live in Berlin"),
|
|
29
|
+
Document(text="My name is Paul and I live in New York"),
|
|
30
|
+
Document(text="My name is Silvano and I live in Matera"),
|
|
31
|
+
Document(text="My name is Usagi Tsukino and I live in Tokyo"),
|
|
32
|
+
]
|
|
33
|
+
document_store.write_documents(documents)
|
|
34
|
+
|
|
35
|
+
te = SentenceTransformersTextEmbedder()
|
|
36
|
+
te.warm_up()
|
|
37
|
+
query_embeddings = te.run("Who lives in Berlin?")["embedding"]
|
|
38
|
+
|
|
39
|
+
result = retriever.run(query=query_embeddings)
|
|
40
|
+
for doc in result["documents"]:
|
|
41
|
+
print(doc.content)
|
|
42
|
+
```
|
|
17
43
|
"""
|
|
18
44
|
|
|
19
45
|
def __init__(
|
|
@@ -28,13 +54,13 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
28
54
|
Create the ElasticsearchEmbeddingRetriever component.
|
|
29
55
|
|
|
30
56
|
:param document_store: An instance of ElasticsearchDocumentStore.
|
|
31
|
-
:param filters: Filters applied to the retrieved Documents.
|
|
32
|
-
Filters are applied during the approximate
|
|
33
|
-
:param top_k: Maximum number of Documents to return
|
|
57
|
+
:param filters: Filters applied to the retrieved Documents.
|
|
58
|
+
Filters are applied during the approximate KNN search to ensure that top_k matching documents are returned.
|
|
59
|
+
:param top_k: Maximum number of Documents to return.
|
|
34
60
|
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
|
|
35
61
|
Increasing this value will improve search accuracy at the cost of slower search speeds.
|
|
36
|
-
You can read more about it in the Elasticsearch
|
|
37
|
-
https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
|
|
62
|
+
You can read more about it in the Elasticsearch
|
|
63
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
|
|
38
64
|
:raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
|
|
39
65
|
"""
|
|
40
66
|
if not isinstance(document_store, ElasticsearchDocumentStore):
|
|
@@ -47,6 +73,12 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
47
73
|
self._num_candidates = num_candidates
|
|
48
74
|
|
|
49
75
|
def to_dict(self) -> Dict[str, Any]:
|
|
76
|
+
"""
|
|
77
|
+
Serializes the component to a dictionary.
|
|
78
|
+
|
|
79
|
+
:returns:
|
|
80
|
+
Dictionary with serialized data.
|
|
81
|
+
"""
|
|
50
82
|
return default_to_dict(
|
|
51
83
|
self,
|
|
52
84
|
filters=self._filters,
|
|
@@ -57,23 +89,33 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
57
89
|
|
|
58
90
|
@classmethod
|
|
59
91
|
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
|
|
92
|
+
"""
|
|
93
|
+
Deserializes the component from a dictionary.
|
|
94
|
+
|
|
95
|
+
:param data:
|
|
96
|
+
Dictionary to deserialize from.
|
|
97
|
+
:returns:
|
|
98
|
+
Deserialized component.
|
|
99
|
+
"""
|
|
60
100
|
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
|
|
61
101
|
data["init_parameters"]["document_store"]
|
|
62
102
|
)
|
|
63
103
|
return default_from_dict(cls, data)
|
|
64
104
|
|
|
65
105
|
@component.output_types(documents=List[Document])
|
|
66
|
-
def run(self, query_embedding: List[float], top_k: Optional[int] = None):
|
|
106
|
+
def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
|
|
67
107
|
"""
|
|
68
108
|
Retrieve documents using a vector similarity metric.
|
|
69
109
|
|
|
70
110
|
:param query_embedding: Embedding of the query.
|
|
71
|
-
:param
|
|
72
|
-
:
|
|
111
|
+
:param filters: Filters applied to the retrieved `Document`s.
|
|
112
|
+
:param top_k: Maximum number of `Document`s to return.
|
|
113
|
+
:returns: A dictionary with the following keys:
|
|
114
|
+
- `documents`: List of `Document`s most similar to the given `query_embedding`
|
|
73
115
|
"""
|
|
74
116
|
docs = self._document_store._embedding_retrieval(
|
|
75
117
|
query_embedding=query_embedding,
|
|
76
|
-
filters=self._filters,
|
|
118
|
+
filters=filters or self._filters,
|
|
77
119
|
top_k=top_k or self._top_k,
|
|
78
120
|
num_candidates=self._num_candidates,
|
|
79
121
|
)
|
|
@@ -13,6 +13,7 @@ from haystack.dataclasses import Document
|
|
|
13
13
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
14
14
|
from haystack.document_stores.types import DuplicatePolicy
|
|
15
15
|
from haystack.utils.filters import convert
|
|
16
|
+
from haystack.version import __version__ as haystack_version
|
|
16
17
|
|
|
17
18
|
from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found]
|
|
18
19
|
|
|
@@ -34,16 +35,16 @@ BM25_SCALING_FACTOR = 8
|
|
|
34
35
|
|
|
35
36
|
class ElasticsearchDocumentStore:
|
|
36
37
|
"""
|
|
37
|
-
ElasticsearchDocumentStore is a Document Store for Elasticsearch.
|
|
38
|
-
|
|
38
|
+
ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
|
|
39
|
+
Elasticsearch cluster.
|
|
39
40
|
|
|
40
|
-
|
|
41
|
+
Usage example (Elastic Cloud):
|
|
41
42
|
```python
|
|
42
43
|
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
|
43
44
|
document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY")
|
|
44
45
|
```
|
|
45
46
|
|
|
46
|
-
|
|
47
|
+
Usage example (self-hosted Elasticsearch instance):
|
|
47
48
|
```python
|
|
48
49
|
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
|
49
50
|
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
|
|
@@ -52,8 +53,8 @@ class ElasticsearchDocumentStore:
|
|
|
52
53
|
We strongly recommend to enable security so that only authorized users can access your data.
|
|
53
54
|
|
|
54
55
|
For more details on how to connect to Elasticsearch and configure security,
|
|
55
|
-
see the official Elasticsearch
|
|
56
|
-
https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
|
|
56
|
+
see the official Elasticsearch
|
|
57
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
|
|
57
58
|
|
|
58
59
|
All extra keyword arguments will be passed to the Elasticsearch client.
|
|
59
60
|
"""
|
|
@@ -68,29 +69,33 @@ class ElasticsearchDocumentStore:
|
|
|
68
69
|
):
|
|
69
70
|
"""
|
|
70
71
|
Creates a new ElasticsearchDocumentStore instance.
|
|
71
|
-
|
|
72
|
-
It will also try to create that index if it doesn't exist yet. Otherwise it will use the existing one.
|
|
72
|
+
|
|
73
|
+
It will also try to create that index if it doesn't exist yet. Otherwise, it will use the existing one.
|
|
73
74
|
|
|
74
75
|
One can also set the similarity function used to compare Documents embeddings. This is mostly useful
|
|
75
76
|
when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
|
|
76
77
|
|
|
77
|
-
For more information on connection parameters, see the official Elasticsearch
|
|
78
|
-
https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
|
|
78
|
+
For more information on connection parameters, see the official Elasticsearch
|
|
79
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
|
|
79
80
|
|
|
80
|
-
For the full list of supported kwargs, see the official Elasticsearch
|
|
81
|
-
https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch
|
|
81
|
+
For the full list of supported kwargs, see the official Elasticsearch
|
|
82
|
+
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
|
|
82
83
|
|
|
83
|
-
:param hosts: List of hosts running the Elasticsearch client.
|
|
84
|
-
:param index: Name of index in Elasticsearch
|
|
84
|
+
:param hosts: List of hosts running the Elasticsearch client.
|
|
85
|
+
:param index: Name of index in Elasticsearch.
|
|
85
86
|
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
|
|
86
|
-
|
|
87
|
+
This parameter only takes effect if the index does not yet exist and is created.
|
|
87
88
|
To choose the most appropriate function, look for information about your embedding model.
|
|
88
|
-
To understand how document scores are computed, see the Elasticsearch
|
|
89
|
-
https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
|
|
90
|
-
:param **kwargs: Optional arguments that
|
|
89
|
+
To understand how document scores are computed, see the Elasticsearch
|
|
90
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
|
|
91
|
+
:param **kwargs: Optional arguments that `Elasticsearch` takes.
|
|
91
92
|
"""
|
|
92
93
|
self._hosts = hosts
|
|
93
|
-
self._client = Elasticsearch(
|
|
94
|
+
self._client = Elasticsearch(
|
|
95
|
+
hosts,
|
|
96
|
+
headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
|
|
97
|
+
**kwargs,
|
|
98
|
+
)
|
|
94
99
|
self._index = index
|
|
95
100
|
self._embedding_similarity_function = embedding_similarity_function
|
|
96
101
|
self._kwargs = kwargs
|
|
@@ -101,8 +106,20 @@ class ElasticsearchDocumentStore:
|
|
|
101
106
|
# configure mapping for the embedding field
|
|
102
107
|
mappings = {
|
|
103
108
|
"properties": {
|
|
104
|
-
"embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function}
|
|
105
|
-
|
|
109
|
+
"embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function},
|
|
110
|
+
"content": {"type": "text"},
|
|
111
|
+
},
|
|
112
|
+
"dynamic_templates": [
|
|
113
|
+
{
|
|
114
|
+
"strings": {
|
|
115
|
+
"path_match": "*",
|
|
116
|
+
"match_mapping_type": "string",
|
|
117
|
+
"mapping": {
|
|
118
|
+
"type": "keyword",
|
|
119
|
+
},
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
],
|
|
106
123
|
}
|
|
107
124
|
|
|
108
125
|
# Create the index if it doesn't exist
|
|
@@ -110,6 +127,12 @@ class ElasticsearchDocumentStore:
|
|
|
110
127
|
self._client.indices.create(index=index, mappings=mappings)
|
|
111
128
|
|
|
112
129
|
def to_dict(self) -> Dict[str, Any]:
|
|
130
|
+
"""
|
|
131
|
+
Serializes the component to a dictionary.
|
|
132
|
+
|
|
133
|
+
:returns:
|
|
134
|
+
Dictionary with serialized data.
|
|
135
|
+
"""
|
|
113
136
|
# This is not the best solution to serialise this class but is the fastest to implement.
|
|
114
137
|
# Not all kwargs types can be serialised to text so this can fail. We must serialise each
|
|
115
138
|
# type explicitly to handle this properly.
|
|
@@ -123,11 +146,20 @@ class ElasticsearchDocumentStore:
|
|
|
123
146
|
|
|
124
147
|
@classmethod
|
|
125
148
|
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore":
|
|
149
|
+
"""
|
|
150
|
+
Deserializes the component from a dictionary.
|
|
151
|
+
|
|
152
|
+
:param data:
|
|
153
|
+
Dictionary to deserialize from.
|
|
154
|
+
:returns:
|
|
155
|
+
Deserialized component.
|
|
156
|
+
"""
|
|
126
157
|
return default_from_dict(cls, data)
|
|
127
158
|
|
|
128
159
|
def count_documents(self) -> int:
|
|
129
160
|
"""
|
|
130
161
|
Returns how many documents are present in the document store.
|
|
162
|
+
:returns: Number of documents in the document store.
|
|
131
163
|
"""
|
|
132
164
|
return self._client.count(index=self._index)["count"]
|
|
133
165
|
|
|
@@ -160,6 +192,14 @@ class ElasticsearchDocumentStore:
|
|
|
160
192
|
return documents
|
|
161
193
|
|
|
162
194
|
def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
|
|
195
|
+
"""
|
|
196
|
+
The main query method for the document store. It retrieves all documents that match the filters.
|
|
197
|
+
|
|
198
|
+
:param filters: A dictionary of filters to apply. For more information on the structure of the filters,
|
|
199
|
+
see the official Elasticsearch
|
|
200
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
|
|
201
|
+
:returns: List of `Document`s that match the filters.
|
|
202
|
+
"""
|
|
163
203
|
if filters and "operator" not in filters and "conditions" not in filters:
|
|
164
204
|
filters = convert(filters)
|
|
165
205
|
|
|
@@ -169,9 +209,15 @@ class ElasticsearchDocumentStore:
|
|
|
169
209
|
|
|
170
210
|
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
|
|
171
211
|
"""
|
|
172
|
-
Writes
|
|
173
|
-
|
|
174
|
-
|
|
212
|
+
Writes `Document`s to Elasticsearch.
|
|
213
|
+
|
|
214
|
+
:param documents: List of Documents to write to the document store.
|
|
215
|
+
:param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
|
|
216
|
+
:raises ValueError: If `documents` is not a list of `Document`s.
|
|
217
|
+
:raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
|
|
218
|
+
`policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
|
|
219
|
+
:raises DocumentStoreError: If an error occurs while writing the documents to the document store.
|
|
220
|
+
:returns: Number of documents written to the document store.
|
|
175
221
|
"""
|
|
176
222
|
if len(documents) > 0:
|
|
177
223
|
if not isinstance(documents[0], Document):
|
|
@@ -182,16 +228,30 @@ class ElasticsearchDocumentStore:
|
|
|
182
228
|
policy = DuplicatePolicy.FAIL
|
|
183
229
|
|
|
184
230
|
action = "index" if policy == DuplicatePolicy.OVERWRITE else "create"
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
231
|
+
|
|
232
|
+
elasticsearch_actions = []
|
|
233
|
+
for doc in documents:
|
|
234
|
+
doc_dict = doc.to_dict()
|
|
235
|
+
if "sparse_embedding" in doc_dict:
|
|
236
|
+
sparse_embedding = doc_dict.pop("sparse_embedding", None)
|
|
237
|
+
if sparse_embedding:
|
|
238
|
+
logger.warning(
|
|
239
|
+
"Document %s has the `sparse_embedding` field set,"
|
|
240
|
+
"but storing sparse embeddings in Elasticsearch is not currently supported."
|
|
241
|
+
"The `sparse_embedding` field will be ignored.",
|
|
242
|
+
doc.id,
|
|
243
|
+
)
|
|
244
|
+
elasticsearch_actions.append(
|
|
188
245
|
{
|
|
189
246
|
"_op_type": action,
|
|
190
247
|
"_id": doc.id,
|
|
191
|
-
"_source":
|
|
248
|
+
"_source": doc_dict,
|
|
192
249
|
}
|
|
193
|
-
|
|
194
|
-
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
documents_written, errors = helpers.bulk(
|
|
253
|
+
client=self._client,
|
|
254
|
+
actions=elasticsearch_actions,
|
|
195
255
|
refresh="wait_for",
|
|
196
256
|
index=self._index,
|
|
197
257
|
raise_on_error=False,
|
|
@@ -220,10 +280,15 @@ class ElasticsearchDocumentStore:
|
|
|
220
280
|
|
|
221
281
|
return documents_written
|
|
222
282
|
|
|
223
|
-
|
|
283
|
+
@staticmethod
|
|
284
|
+
def _deserialize_document(hit: Dict[str, Any]) -> Document:
|
|
224
285
|
"""
|
|
225
|
-
Creates a Document from the search hit provided.
|
|
286
|
+
Creates a `Document` from the search hit provided.
|
|
287
|
+
|
|
226
288
|
This is mostly useful in self.filter_documents().
|
|
289
|
+
|
|
290
|
+
:param hit: A search hit from Elasticsearch.
|
|
291
|
+
:returns: `Document` created from the search hit.
|
|
227
292
|
"""
|
|
228
293
|
data = hit["_source"]
|
|
229
294
|
|
|
@@ -235,12 +300,11 @@ class ElasticsearchDocumentStore:
|
|
|
235
300
|
|
|
236
301
|
def delete_documents(self, document_ids: List[str]) -> None:
|
|
237
302
|
"""
|
|
238
|
-
Deletes all
|
|
303
|
+
Deletes all `Document`s with a matching `document_ids` from the document store.
|
|
239
304
|
|
|
240
|
-
:param
|
|
305
|
+
:param document_ids: the object IDs to delete
|
|
241
306
|
"""
|
|
242
307
|
|
|
243
|
-
#
|
|
244
308
|
helpers.bulk(
|
|
245
309
|
client=self._client,
|
|
246
310
|
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
@@ -259,26 +323,25 @@ class ElasticsearchDocumentStore:
|
|
|
259
323
|
scale_score: bool = False,
|
|
260
324
|
) -> List[Document]:
|
|
261
325
|
"""
|
|
262
|
-
Elasticsearch
|
|
326
|
+
Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
|
|
327
|
+
|
|
263
328
|
Even though this method is called `bm25_retrieval` it searches for `query`
|
|
264
329
|
using the search algorithm `_client` was configured with.
|
|
265
330
|
|
|
266
|
-
This method is not
|
|
331
|
+
This method is not meant to be part of the public interface of
|
|
267
332
|
`ElasticsearchDocumentStore` nor called directly.
|
|
268
333
|
`ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
|
|
269
334
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
:param
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
:param top_k: Maximum number of Documents to return, defaults to 10
|
|
279
|
-
:param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
|
|
335
|
+
:param query: String to search in saved `Document`s' text.
|
|
336
|
+
:param filters: Filters applied to the retrieved `Document`s, for more info
|
|
337
|
+
see `ElasticsearchDocumentStore.filter_documents`.
|
|
338
|
+
:param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
|
|
339
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
|
|
340
|
+
for valid values.
|
|
341
|
+
:param top_k: Maximum number of `Document`s to return.
|
|
342
|
+
:param scale_score: If `True` scales the `Document``s scores between 0 and 1.
|
|
280
343
|
:raises ValueError: If `query` is an empty string
|
|
281
|
-
:
|
|
344
|
+
:returns: List of `Document` that match `query`
|
|
282
345
|
"""
|
|
283
346
|
|
|
284
347
|
if not query:
|
|
@@ -324,22 +387,23 @@ class ElasticsearchDocumentStore:
|
|
|
324
387
|
) -> List[Document]:
|
|
325
388
|
"""
|
|
326
389
|
Retrieves documents that are most similar to the query embedding using a vector similarity metric.
|
|
390
|
+
|
|
327
391
|
It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
|
|
328
392
|
|
|
329
|
-
This method is not
|
|
393
|
+
This method is not meant to be part of the public interface of
|
|
330
394
|
`ElasticsearchDocumentStore` nor called directly.
|
|
331
395
|
`ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
|
|
332
396
|
|
|
333
397
|
:param query_embedding: Embedding of the query.
|
|
334
|
-
:param filters: Filters applied to the retrieved
|
|
398
|
+
:param filters: Filters applied to the retrieved `Document`s.
|
|
335
399
|
Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
|
|
336
|
-
:param top_k: Maximum number of
|
|
400
|
+
:param top_k: Maximum number of `Document`s to return.
|
|
337
401
|
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
|
|
338
402
|
Increasing this value will improve search accuracy at the cost of slower search speeds.
|
|
339
|
-
You can read more about it in the Elasticsearch
|
|
340
|
-
https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
|
|
341
|
-
:raises ValueError: If `query_embedding` is an empty list
|
|
342
|
-
:
|
|
403
|
+
You can read more about it in the Elasticsearch
|
|
404
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
|
|
405
|
+
:raises ValueError: If `query_embedding` is an empty list.
|
|
406
|
+
:returns: List of `Document` that are most similar to `query_embedding`.
|
|
343
407
|
"""
|
|
344
408
|
|
|
345
409
|
if not query_embedding:
|
|
@@ -15,6 +15,36 @@ from haystack.testing.document_store import DocumentStoreBaseTests
|
|
|
15
15
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
19
|
+
def test_to_dict(_mock_elasticsearch_client):
|
|
20
|
+
document_store = ElasticsearchDocumentStore(hosts="some hosts")
|
|
21
|
+
res = document_store.to_dict()
|
|
22
|
+
assert res == {
|
|
23
|
+
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
24
|
+
"init_parameters": {
|
|
25
|
+
"hosts": "some hosts",
|
|
26
|
+
"index": "default",
|
|
27
|
+
"embedding_similarity_function": "cosine",
|
|
28
|
+
},
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
33
|
+
def test_from_dict(_mock_elasticsearch_client):
|
|
34
|
+
data = {
|
|
35
|
+
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
36
|
+
"init_parameters": {
|
|
37
|
+
"hosts": "some hosts",
|
|
38
|
+
"index": "default",
|
|
39
|
+
"embedding_similarity_function": "cosine",
|
|
40
|
+
},
|
|
41
|
+
}
|
|
42
|
+
document_store = ElasticsearchDocumentStore.from_dict(data)
|
|
43
|
+
assert document_store._hosts == "some hosts"
|
|
44
|
+
assert document_store._index == "default"
|
|
45
|
+
assert document_store._embedding_similarity_function == "cosine"
|
|
46
|
+
|
|
47
|
+
|
|
18
48
|
@pytest.mark.integration
|
|
19
49
|
class TestDocumentStore(DocumentStoreBaseTests):
|
|
20
50
|
"""
|
|
@@ -67,33 +97,8 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
67
97
|
|
|
68
98
|
super().assert_documents_are_equal(received, expected)
|
|
69
99
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
document_store = ElasticsearchDocumentStore(hosts="some hosts")
|
|
73
|
-
res = document_store.to_dict()
|
|
74
|
-
assert res == {
|
|
75
|
-
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
76
|
-
"init_parameters": {
|
|
77
|
-
"hosts": "some hosts",
|
|
78
|
-
"index": "default",
|
|
79
|
-
"embedding_similarity_function": "cosine",
|
|
80
|
-
},
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
84
|
-
def test_from_dict(self, _mock_elasticsearch_client):
|
|
85
|
-
data = {
|
|
86
|
-
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
87
|
-
"init_parameters": {
|
|
88
|
-
"hosts": "some hosts",
|
|
89
|
-
"index": "default",
|
|
90
|
-
"embedding_similarity_function": "cosine",
|
|
91
|
-
},
|
|
92
|
-
}
|
|
93
|
-
document_store = ElasticsearchDocumentStore.from_dict(data)
|
|
94
|
-
assert document_store._hosts == "some hosts"
|
|
95
|
-
assert document_store._index == "default"
|
|
96
|
-
assert document_store._embedding_similarity_function == "cosine"
|
|
100
|
+
def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
|
|
101
|
+
assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
|
|
97
102
|
|
|
98
103
|
def test_write_documents(self, document_store: ElasticsearchDocumentStore):
|
|
99
104
|
docs = [Document(id="1")]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{elasticsearch_haystack-0.2.0 → elasticsearch_haystack-0.4.0}/tests/test_embedding_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|