elasticsearch-haystack 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/PKG-INFO +2 -1
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/pydoc/config.yml +1 -1
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/pyproject.toml +1 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +53 -30
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_bm25_retriever.py +1 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_document_store.py +42 -3
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_embedding_retriever.py +1 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/.gitignore +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/LICENSE +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/README.md +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_filters.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -9,6 +9,7 @@ Author-email: Silvano Cerza <silvanocerza@gmail.com>
|
|
|
9
9
|
License-Expression: Apache-2.0
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
13
|
Classifier: Programming Language :: Python
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.8
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -17,7 +17,7 @@ processors:
|
|
|
17
17
|
- type: smart
|
|
18
18
|
- type: crossref
|
|
19
19
|
renderer:
|
|
20
|
-
type: haystack_pydoc_tools.renderers.
|
|
20
|
+
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
|
|
21
21
|
excerpt: Elasticsearch integration for Haystack
|
|
22
22
|
category_slug: integrations-api
|
|
23
23
|
title: Elasticsearch
|
|
@@ -14,6 +14,7 @@ authors = [
|
|
|
14
14
|
{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" },
|
|
15
15
|
]
|
|
16
16
|
classifiers = [
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
17
18
|
"Development Status :: 4 - Beta",
|
|
18
19
|
"Programming Language :: Python",
|
|
19
20
|
"Programming Language :: Python :: 3.8",
|
|
@@ -63,6 +63,7 @@ class ElasticsearchDocumentStore:
|
|
|
63
63
|
self,
|
|
64
64
|
*,
|
|
65
65
|
hosts: Optional[Hosts] = None,
|
|
66
|
+
custom_mapping: Optional[Dict[str, Any]] = None,
|
|
66
67
|
index: str = "default",
|
|
67
68
|
embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
|
|
68
69
|
**kwargs,
|
|
@@ -82,6 +83,7 @@ class ElasticsearchDocumentStore:
|
|
|
82
83
|
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
|
|
83
84
|
|
|
84
85
|
:param hosts: List of hosts running the Elasticsearch client.
|
|
86
|
+
:param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
|
|
85
87
|
:param index: Name of index in Elasticsearch.
|
|
86
88
|
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
|
|
87
89
|
This parameter only takes effect if the index does not yet exist and is created.
|
|
@@ -91,40 +93,60 @@ class ElasticsearchDocumentStore:
|
|
|
91
93
|
:param **kwargs: Optional arguments that `Elasticsearch` takes.
|
|
92
94
|
"""
|
|
93
95
|
self._hosts = hosts
|
|
94
|
-
self._client = Elasticsearch(
|
|
95
|
-
hosts,
|
|
96
|
-
headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
|
|
97
|
-
**kwargs,
|
|
98
|
-
)
|
|
96
|
+
self._client = None
|
|
99
97
|
self._index = index
|
|
100
98
|
self._embedding_similarity_function = embedding_similarity_function
|
|
99
|
+
self._custom_mapping = custom_mapping
|
|
101
100
|
self._kwargs = kwargs
|
|
102
101
|
|
|
103
|
-
|
|
104
|
-
|
|
102
|
+
if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
|
|
103
|
+
msg = "custom_mapping must be a dictionary"
|
|
104
|
+
raise ValueError(msg)
|
|
105
105
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
106
|
+
@property
|
|
107
|
+
def client(self) -> Elasticsearch:
|
|
108
|
+
if self._client is None:
|
|
109
|
+
client = Elasticsearch(
|
|
110
|
+
self._hosts,
|
|
111
|
+
headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
|
|
112
|
+
**self._kwargs,
|
|
113
|
+
)
|
|
114
|
+
# Check client connection, this will raise if not connected
|
|
115
|
+
client.info()
|
|
116
|
+
|
|
117
|
+
if self._custom_mapping:
|
|
118
|
+
mappings = self._custom_mapping
|
|
119
|
+
else:
|
|
120
|
+
# Configure mapping for the embedding field if none is provided
|
|
121
|
+
mappings = {
|
|
122
|
+
"properties": {
|
|
123
|
+
"embedding": {
|
|
124
|
+
"type": "dense_vector",
|
|
125
|
+
"index": True,
|
|
126
|
+
"similarity": self._embedding_similarity_function,
|
|
119
127
|
},
|
|
120
|
-
|
|
128
|
+
"content": {"type": "text"},
|
|
129
|
+
},
|
|
130
|
+
"dynamic_templates": [
|
|
131
|
+
{
|
|
132
|
+
"strings": {
|
|
133
|
+
"path_match": "*",
|
|
134
|
+
"match_mapping_type": "string",
|
|
135
|
+
"mapping": {
|
|
136
|
+
"type": "keyword",
|
|
137
|
+
},
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
],
|
|
121
141
|
}
|
|
122
|
-
],
|
|
123
|
-
}
|
|
124
142
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
143
|
+
# Create the index if it doesn't exist
|
|
144
|
+
if not client.indices.exists(index=self._index):
|
|
145
|
+
client.indices.create(index=self._index, mappings=mappings)
|
|
146
|
+
|
|
147
|
+
self._client = client
|
|
148
|
+
|
|
149
|
+
return self._client
|
|
128
150
|
|
|
129
151
|
def to_dict(self) -> Dict[str, Any]:
|
|
130
152
|
"""
|
|
@@ -139,6 +161,7 @@ class ElasticsearchDocumentStore:
|
|
|
139
161
|
return default_to_dict(
|
|
140
162
|
self,
|
|
141
163
|
hosts=self._hosts,
|
|
164
|
+
custom_mapping=self._custom_mapping,
|
|
142
165
|
index=self._index,
|
|
143
166
|
embedding_similarity_function=self._embedding_similarity_function,
|
|
144
167
|
**self._kwargs,
|
|
@@ -161,7 +184,7 @@ class ElasticsearchDocumentStore:
|
|
|
161
184
|
Returns how many documents are present in the document store.
|
|
162
185
|
:returns: Number of documents in the document store.
|
|
163
186
|
"""
|
|
164
|
-
return self._client.count(index=self._index)["count"]
|
|
187
|
+
return self.client.count(index=self._index)["count"]
|
|
165
188
|
|
|
166
189
|
def _search_documents(self, **kwargs) -> List[Document]:
|
|
167
190
|
"""
|
|
@@ -176,7 +199,7 @@ class ElasticsearchDocumentStore:
|
|
|
176
199
|
from_ = 0
|
|
177
200
|
# Handle pagination
|
|
178
201
|
while True:
|
|
179
|
-
res = self._client.search(
|
|
202
|
+
res = self.client.search(
|
|
180
203
|
index=self._index,
|
|
181
204
|
from_=from_,
|
|
182
205
|
**kwargs,
|
|
@@ -250,7 +273,7 @@ class ElasticsearchDocumentStore:
|
|
|
250
273
|
)
|
|
251
274
|
|
|
252
275
|
documents_written, errors = helpers.bulk(
|
|
253
|
-
client=self._client,
|
|
276
|
+
client=self.client,
|
|
254
277
|
actions=elasticsearch_actions,
|
|
255
278
|
refresh="wait_for",
|
|
256
279
|
index=self._index,
|
|
@@ -306,7 +329,7 @@ class ElasticsearchDocumentStore:
|
|
|
306
329
|
"""
|
|
307
330
|
|
|
308
331
|
helpers.bulk(
|
|
309
|
-
client=self._client,
|
|
332
|
+
client=self.client,
|
|
310
333
|
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
311
334
|
refresh="wait_for",
|
|
312
335
|
index=self._index,
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
import random
|
|
6
6
|
from typing import List
|
|
7
|
-
from unittest.mock import patch
|
|
7
|
+
from unittest.mock import Mock, patch
|
|
8
8
|
|
|
9
9
|
import pytest
|
|
10
10
|
from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found]
|
|
@@ -15,6 +15,12 @@ from haystack.testing.document_store import DocumentStoreBaseTests
|
|
|
15
15
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
19
|
+
def test_init_is_lazy(_mock_es_client):
|
|
20
|
+
ElasticsearchDocumentStore(hosts="testhost")
|
|
21
|
+
_mock_es_client.assert_not_called()
|
|
22
|
+
|
|
23
|
+
|
|
18
24
|
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
19
25
|
def test_to_dict(_mock_elasticsearch_client):
|
|
20
26
|
document_store = ElasticsearchDocumentStore(hosts="some hosts")
|
|
@@ -23,6 +29,7 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
23
29
|
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
24
30
|
"init_parameters": {
|
|
25
31
|
"hosts": "some hosts",
|
|
32
|
+
"custom_mapping": None,
|
|
26
33
|
"index": "default",
|
|
27
34
|
"embedding_similarity_function": "cosine",
|
|
28
35
|
},
|
|
@@ -35,6 +42,7 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
35
42
|
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
36
43
|
"init_parameters": {
|
|
37
44
|
"hosts": "some hosts",
|
|
45
|
+
"custom_mapping": None,
|
|
38
46
|
"index": "default",
|
|
39
47
|
"embedding_similarity_function": "cosine",
|
|
40
48
|
},
|
|
@@ -42,6 +50,7 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
42
50
|
document_store = ElasticsearchDocumentStore.from_dict(data)
|
|
43
51
|
assert document_store._hosts == "some hosts"
|
|
44
52
|
assert document_store._index == "default"
|
|
53
|
+
assert document_store._custom_mapping is None
|
|
45
54
|
assert document_store._embedding_similarity_function == "cosine"
|
|
46
55
|
|
|
47
56
|
|
|
@@ -70,7 +79,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
70
79
|
hosts=hosts, index=index, embedding_similarity_function=embedding_similarity_function
|
|
71
80
|
)
|
|
72
81
|
yield store
|
|
73
|
-
store._client.options(ignore_status=[400, 404]).indices.delete(index=index)
|
|
82
|
+
store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
|
|
74
83
|
|
|
75
84
|
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
76
85
|
"""
|
|
@@ -98,7 +107,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
98
107
|
super().assert_documents_are_equal(received, expected)
|
|
99
108
|
|
|
100
109
|
def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
|
|
101
|
-
assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
|
|
110
|
+
assert document_store.client._headers["user-agent"].startswith("haystack-py-ds/")
|
|
102
111
|
|
|
103
112
|
def test_write_documents(self, document_store: ElasticsearchDocumentStore):
|
|
104
113
|
docs = [Document(id="1")]
|
|
@@ -280,3 +289,33 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
280
289
|
|
|
281
290
|
with pytest.raises(DocumentStoreError):
|
|
282
291
|
document_store.write_documents(docs)
|
|
292
|
+
|
|
293
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
294
|
+
def test_init_with_custom_mapping(self, mock_elasticsearch):
|
|
295
|
+
custom_mapping = {
|
|
296
|
+
"properties": {
|
|
297
|
+
"embedding": {"type": "dense_vector", "index": True, "similarity": "dot_product"},
|
|
298
|
+
"content": {"type": "text"},
|
|
299
|
+
},
|
|
300
|
+
"dynamic_templates": [
|
|
301
|
+
{
|
|
302
|
+
"strings": {
|
|
303
|
+
"path_match": "*",
|
|
304
|
+
"match_mapping_type": "string",
|
|
305
|
+
"mapping": {
|
|
306
|
+
"type": "keyword",
|
|
307
|
+
},
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
],
|
|
311
|
+
}
|
|
312
|
+
mock_client = Mock(
|
|
313
|
+
indices=Mock(create=Mock(), exists=Mock(return_value=False)),
|
|
314
|
+
)
|
|
315
|
+
mock_elasticsearch.return_value = mock_client
|
|
316
|
+
|
|
317
|
+
_ = ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping).client
|
|
318
|
+
mock_client.indices.create.assert_called_once_with(
|
|
319
|
+
index="default",
|
|
320
|
+
mappings=custom_mapping,
|
|
321
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|