elasticsearch-haystack 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (18)
  1. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/PKG-INFO +2 -1
  2. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/pydoc/config.yml +1 -1
  3. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/pyproject.toml +1 -0
  4. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +53 -30
  5. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_bm25_retriever.py +1 -0
  6. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_document_store.py +42 -3
  7. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_embedding_retriever.py +1 -0
  8. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/.gitignore +0 -0
  9. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/LICENSE +0 -0
  10. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/README.md +0 -0
  11. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/docker-compose.yml +0 -0
  12. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
  13. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
  14. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
  15. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
  16. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
  17. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/__init__.py +0 -0
  18. {elasticsearch_haystack-0.4.0 → elasticsearch_haystack-0.6.0}/tests/test_filters.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: elasticsearch-haystack
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Haystack 2.x Document Store for ElasticSearch
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -9,6 +9,7 @@ Author-email: Silvano Cerza <silvanocerza@gmail.com>
9
9
  License-Expression: Apache-2.0
10
10
  License-File: LICENSE
11
11
  Classifier: Development Status :: 4 - Beta
12
+ Classifier: License :: OSI Approved :: Apache Software License
12
13
  Classifier: Programming Language :: Python
13
14
  Classifier: Programming Language :: Python :: 3.8
14
15
  Classifier: Programming Language :: Python :: 3.9
@@ -17,7 +17,7 @@ processors:
17
17
  - type: smart
18
18
  - type: crossref
19
19
  renderer:
20
- type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
20
+ type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
21
21
  excerpt: Elasticsearch integration for Haystack
22
22
  category_slug: integrations-api
23
23
  title: Elasticsearch
@@ -14,6 +14,7 @@ authors = [
14
14
  { name = "Silvano Cerza", email = "silvanocerza@gmail.com" },
15
15
  ]
16
16
  classifiers = [
17
+ "License :: OSI Approved :: Apache Software License",
17
18
  "Development Status :: 4 - Beta",
18
19
  "Programming Language :: Python",
19
20
  "Programming Language :: Python :: 3.8",
@@ -63,6 +63,7 @@ class ElasticsearchDocumentStore:
63
63
  self,
64
64
  *,
65
65
  hosts: Optional[Hosts] = None,
66
+ custom_mapping: Optional[Dict[str, Any]] = None,
66
67
  index: str = "default",
67
68
  embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
68
69
  **kwargs,
@@ -82,6 +83,7 @@ class ElasticsearchDocumentStore:
82
83
  [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
83
84
 
84
85
  :param hosts: List of hosts running the Elasticsearch client.
86
+ :param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
85
87
  :param index: Name of index in Elasticsearch.
86
88
  :param embedding_similarity_function: The similarity function used to compare Documents embeddings.
87
89
  This parameter only takes effect if the index does not yet exist and is created.
@@ -91,40 +93,60 @@ class ElasticsearchDocumentStore:
91
93
  :param **kwargs: Optional arguments that `Elasticsearch` takes.
92
94
  """
93
95
  self._hosts = hosts
94
- self._client = Elasticsearch(
95
- hosts,
96
- headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
97
- **kwargs,
98
- )
96
+ self._client = None
99
97
  self._index = index
100
98
  self._embedding_similarity_function = embedding_similarity_function
99
+ self._custom_mapping = custom_mapping
101
100
  self._kwargs = kwargs
102
101
 
103
- # Check client connection, this will raise if not connected
104
- self._client.info()
102
+ if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
103
+ msg = "custom_mapping must be a dictionary"
104
+ raise ValueError(msg)
105
105
 
106
- # configure mapping for the embedding field
107
- mappings = {
108
- "properties": {
109
- "embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function},
110
- "content": {"type": "text"},
111
- },
112
- "dynamic_templates": [
113
- {
114
- "strings": {
115
- "path_match": "*",
116
- "match_mapping_type": "string",
117
- "mapping": {
118
- "type": "keyword",
106
+ @property
107
+ def client(self) -> Elasticsearch:
108
+ if self._client is None:
109
+ client = Elasticsearch(
110
+ self._hosts,
111
+ headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
112
+ **self._kwargs,
113
+ )
114
+ # Check client connection, this will raise if not connected
115
+ client.info()
116
+
117
+ if self._custom_mapping:
118
+ mappings = self._custom_mapping
119
+ else:
120
+ # Configure mapping for the embedding field if none is provided
121
+ mappings = {
122
+ "properties": {
123
+ "embedding": {
124
+ "type": "dense_vector",
125
+ "index": True,
126
+ "similarity": self._embedding_similarity_function,
119
127
  },
120
- }
128
+ "content": {"type": "text"},
129
+ },
130
+ "dynamic_templates": [
131
+ {
132
+ "strings": {
133
+ "path_match": "*",
134
+ "match_mapping_type": "string",
135
+ "mapping": {
136
+ "type": "keyword",
137
+ },
138
+ }
139
+ }
140
+ ],
121
141
  }
122
- ],
123
- }
124
142
 
125
- # Create the index if it doesn't exist
126
- if not self._client.indices.exists(index=index):
127
- self._client.indices.create(index=index, mappings=mappings)
143
+ # Create the index if it doesn't exist
144
+ if not client.indices.exists(index=self._index):
145
+ client.indices.create(index=self._index, mappings=mappings)
146
+
147
+ self._client = client
148
+
149
+ return self._client
128
150
 
129
151
  def to_dict(self) -> Dict[str, Any]:
130
152
  """
@@ -139,6 +161,7 @@ class ElasticsearchDocumentStore:
139
161
  return default_to_dict(
140
162
  self,
141
163
  hosts=self._hosts,
164
+ custom_mapping=self._custom_mapping,
142
165
  index=self._index,
143
166
  embedding_similarity_function=self._embedding_similarity_function,
144
167
  **self._kwargs,
@@ -161,7 +184,7 @@ class ElasticsearchDocumentStore:
161
184
  Returns how many documents are present in the document store.
162
185
  :returns: Number of documents in the document store.
163
186
  """
164
- return self._client.count(index=self._index)["count"]
187
+ return self.client.count(index=self._index)["count"]
165
188
 
166
189
  def _search_documents(self, **kwargs) -> List[Document]:
167
190
  """
@@ -176,7 +199,7 @@ class ElasticsearchDocumentStore:
176
199
  from_ = 0
177
200
  # Handle pagination
178
201
  while True:
179
- res = self._client.search(
202
+ res = self.client.search(
180
203
  index=self._index,
181
204
  from_=from_,
182
205
  **kwargs,
@@ -250,7 +273,7 @@ class ElasticsearchDocumentStore:
250
273
  )
251
274
 
252
275
  documents_written, errors = helpers.bulk(
253
- client=self._client,
276
+ client=self.client,
254
277
  actions=elasticsearch_actions,
255
278
  refresh="wait_for",
256
279
  index=self._index,
@@ -306,7 +329,7 @@ class ElasticsearchDocumentStore:
306
329
  """
307
330
 
308
331
  helpers.bulk(
309
- client=self._client,
332
+ client=self.client,
310
333
  actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
311
334
  refresh="wait_for",
312
335
  index=self._index,
@@ -28,6 +28,7 @@ def test_to_dict(_mock_elasticsearch_client):
28
28
  "document_store": {
29
29
  "init_parameters": {
30
30
  "hosts": "some fake host",
31
+ "custom_mapping": None,
31
32
  "index": "default",
32
33
  "embedding_similarity_function": "cosine",
33
34
  },
@@ -4,7 +4,7 @@
4
4
 
5
5
  import random
6
6
  from typing import List
7
- from unittest.mock import patch
7
+ from unittest.mock import Mock, patch
8
8
 
9
9
  import pytest
10
10
  from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found]
@@ -15,6 +15,12 @@ from haystack.testing.document_store import DocumentStoreBaseTests
15
15
  from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
16
16
 
17
17
 
18
+ @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
19
+ def test_init_is_lazy(_mock_es_client):
20
+ ElasticsearchDocumentStore(hosts="testhost")
21
+ _mock_es_client.assert_not_called()
22
+
23
+
18
24
  @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
19
25
  def test_to_dict(_mock_elasticsearch_client):
20
26
  document_store = ElasticsearchDocumentStore(hosts="some hosts")
@@ -23,6 +29,7 @@ def test_to_dict(_mock_elasticsearch_client):
23
29
  "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
24
30
  "init_parameters": {
25
31
  "hosts": "some hosts",
32
+ "custom_mapping": None,
26
33
  "index": "default",
27
34
  "embedding_similarity_function": "cosine",
28
35
  },
@@ -35,6 +42,7 @@ def test_from_dict(_mock_elasticsearch_client):
35
42
  "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
36
43
  "init_parameters": {
37
44
  "hosts": "some hosts",
45
+ "custom_mapping": None,
38
46
  "index": "default",
39
47
  "embedding_similarity_function": "cosine",
40
48
  },
@@ -42,6 +50,7 @@ def test_from_dict(_mock_elasticsearch_client):
42
50
  document_store = ElasticsearchDocumentStore.from_dict(data)
43
51
  assert document_store._hosts == "some hosts"
44
52
  assert document_store._index == "default"
53
+ assert document_store._custom_mapping is None
45
54
  assert document_store._embedding_similarity_function == "cosine"
46
55
 
47
56
 
@@ -70,7 +79,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
70
79
  hosts=hosts, index=index, embedding_similarity_function=embedding_similarity_function
71
80
  )
72
81
  yield store
73
- store._client.options(ignore_status=[400, 404]).indices.delete(index=index)
82
+ store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
74
83
 
75
84
  def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
76
85
  """
@@ -98,7 +107,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
98
107
  super().assert_documents_are_equal(received, expected)
99
108
 
100
109
  def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
101
- assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
110
+ assert document_store.client._headers["user-agent"].startswith("haystack-py-ds/")
102
111
 
103
112
  def test_write_documents(self, document_store: ElasticsearchDocumentStore):
104
113
  docs = [Document(id="1")]
@@ -280,3 +289,33 @@ class TestDocumentStore(DocumentStoreBaseTests):
280
289
 
281
290
  with pytest.raises(DocumentStoreError):
282
291
  document_store.write_documents(docs)
292
+
293
+ @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
294
+ def test_init_with_custom_mapping(self, mock_elasticsearch):
295
+ custom_mapping = {
296
+ "properties": {
297
+ "embedding": {"type": "dense_vector", "index": True, "similarity": "dot_product"},
298
+ "content": {"type": "text"},
299
+ },
300
+ "dynamic_templates": [
301
+ {
302
+ "strings": {
303
+ "path_match": "*",
304
+ "match_mapping_type": "string",
305
+ "mapping": {
306
+ "type": "keyword",
307
+ },
308
+ }
309
+ }
310
+ ],
311
+ }
312
+ mock_client = Mock(
313
+ indices=Mock(create=Mock(), exists=Mock(return_value=False)),
314
+ )
315
+ mock_elasticsearch.return_value = mock_client
316
+
317
+ _ = ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping).client
318
+ mock_client.indices.create.assert_called_once_with(
319
+ index="default",
320
+ mappings=custom_mapping,
321
+ )
@@ -29,6 +29,7 @@ def test_to_dict(_mock_elasticsearch_client):
29
29
  "document_store": {
30
30
  "init_parameters": {
31
31
  "hosts": "some fake host",
32
+ "custom_mapping": None,
32
33
  "index": "default",
33
34
  "embedding_similarity_function": "cosine",
34
35
  },