elasticsearch-haystack 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (18) hide show
  1. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/PKG-INFO +2 -2
  2. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/pydoc/config.yml +1 -0
  3. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/pyproject.toml +7 -7
  4. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +30 -13
  5. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +51 -10
  6. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +113 -54
  7. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/tests/test_document_store.py +30 -28
  8. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/.gitignore +0 -0
  9. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/LICENSE +0 -0
  10. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/README.md +0 -0
  11. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/docker-compose.yml +0 -0
  12. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
  13. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
  14. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
  15. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/tests/__init__.py +0 -0
  16. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/tests/test_bm25_retriever.py +0 -0
  17. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/tests/test_embedding_retriever.py +0 -0
  18. {elasticsearch_haystack-0.3.0 → elasticsearch_haystack-0.4.0}/tests/test_filters.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: elasticsearch-haystack
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Haystack 2.x Document Store for ElasticSearch
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -25,6 +25,7 @@ renderer:
25
25
  order: 70
26
26
  markdown:
27
27
  descriptive_class_title: false
28
+ classdef_code_block: false
28
29
  descriptive_module_title: true
29
30
  add_method_class_prefix: true
30
31
  add_member_class_prefix: false
@@ -96,12 +96,12 @@ all = [
96
96
  allow-direct-references = true
97
97
 
98
98
  [tool.black]
99
- target-version = ["py37"]
99
+ target-version = ["py38"]
100
100
  line-length = 120
101
101
  skip-string-normalization = true
102
102
 
103
103
  [tool.ruff]
104
- target-version = "py37"
104
+ target-version = "py38"
105
105
  line-length = 120
106
106
  select = [
107
107
  "A",
@@ -156,21 +156,21 @@ ban-relative-imports = "parents"
156
156
  "tests/**/*" = ["PLR2004", "S101", "TID252"]
157
157
 
158
158
  [tool.coverage.run]
159
- source_pkgs = ["src", "tests"]
159
+ source = ["haystack_integrations"]
160
160
  branch = true
161
- parallel = true
161
+ parallel = false
162
162
 
163
- [tool.coverage.paths]
164
- elasticsearch_haystack = ["src/haystack_integrations", "*/elasticsearch/src/haystack_integrations"]
165
- tests = ["tests", "*/elasticsearch/src/tests"]
166
163
 
167
164
  [tool.coverage.report]
165
+ omit = ["*/tests/*", "*/__init__.py"]
166
+ show_missing=true
168
167
  exclude_lines = [
169
168
  "no cov",
170
169
  "if __name__ == .__main__.:",
171
170
  "if TYPE_CHECKING:",
172
171
  ]
173
172
 
173
+
174
174
  [tool.pytest.ini_options]
175
175
  minversion = "6.0"
176
176
  markers = [
@@ -11,8 +11,9 @@ from haystack_integrations.document_stores.elasticsearch.document_store import E
11
11
  @component
12
12
  class ElasticsearchBM25Retriever:
13
13
  """
14
- ElasticsearchBM25Retriever is a keyword-based retriever that uses BM25 to find the most
15
- similar documents to a user's query.
14
+ ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using the BM25 algorithm to find the
15
+ most similar documents to a user's query.
16
+
16
17
  This retriever is only compatible with ElasticsearchDocumentStore.
17
18
 
18
19
  Usage example:
@@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever:
35
36
 
36
37
  result = retriever.run(query="Who lives in Berlin?")
37
38
  for doc in result["documents"]:
38
- print(doc.text)
39
+ print(doc.content)
39
40
  ```
40
41
  """
41
42
 
@@ -53,12 +54,13 @@ class ElasticsearchBM25Retriever:
53
54
 
54
55
  :param document_store: An instance of ElasticsearchDocumentStore.
55
56
  :param filters: Filters applied to the retrieved Documents, for more info
56
- see `ElasticsearchDocumentStore.filter_documents`, defaults to None
57
- :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
58
- see the official documentation for valid values:
59
- https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
60
- :param top_k: Maximum number of Documents to return, defaults to 10
61
- :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
57
+ see `ElasticsearchDocumentStore.filter_documents`.
58
+ :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
59
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
60
+ for more details.
61
+ :param top_k: Maximum number of Documents to return.
62
+ :param scale_score: If `True`, scales the Documents' scores between 0 and 1.
63
+ :raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`.
62
64
  """
63
65
 
64
66
  if not isinstance(document_store, ElasticsearchDocumentStore):
@@ -72,6 +74,12 @@ class ElasticsearchBM25Retriever:
72
74
  self._scale_score = scale_score
73
75
 
74
76
  def to_dict(self) -> Dict[str, Any]:
77
+ """
78
+ Serializes the component to a dictionary.
79
+
80
+ :returns:
81
+ Dictionary with serialized data.
82
+ """
75
83
  return default_to_dict(
76
84
  self,
77
85
  filters=self._filters,
@@ -83,6 +91,14 @@ class ElasticsearchBM25Retriever:
83
91
 
84
92
  @classmethod
85
93
  def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
94
+ """
95
+ Deserializes the component from a dictionary.
96
+
97
+ :param data:
98
+ Dictionary to deserialize from.
99
+ :returns:
100
+ Deserialized component.
101
+ """
86
102
  data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
87
103
  data["init_parameters"]["document_store"]
88
104
  )
@@ -93,10 +109,11 @@ class ElasticsearchBM25Retriever:
93
109
  """
94
110
  Retrieve documents using the BM25 keyword-based algorithm.
95
111
 
96
- :param query: String to search in Documents' text.
97
- :param filters: Filters applied to the retrieved Documents.
98
- :param top_k: Maximum number of Documents to return.
99
- :return: List of Documents that match the query.
112
+ :param query: String to search in `Document`s' text.
113
+ :param filters: Filters applied to the retrieved `Document`s.
114
+ :param top_k: Maximum number of `Document`s to return.
115
+ :returns: A dictionary with the following keys:
116
+ - `documents`: List of `Document`s that match the query.
100
117
  """
101
118
  docs = self._document_store._bm25_retrieval(
102
119
  query=query,
@@ -11,9 +11,35 @@ from haystack_integrations.document_stores.elasticsearch.document_store import E
11
11
  @component
12
12
  class ElasticsearchEmbeddingRetriever:
13
13
  """
14
- Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore.
14
+ ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity.
15
15
 
16
- Needs to be connected to the ElasticsearchDocumentStore to run.
16
+ Usage example:
17
+ ```python
18
+ from haystack import Document
19
+ from haystack.components.embedders import SentenceTransformersTextEmbedder
20
+ from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
21
+ from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
22
+
23
+ document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
24
+ retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
25
+
26
+ # Add documents to DocumentStore
27
+ documents = [
28
+ Document(content="My name is Carla and I live in Berlin"),
29
+ Document(content="My name is Paul and I live in New York"),
30
+ Document(content="My name is Silvano and I live in Matera"),
31
+ Document(content="My name is Usagi Tsukino and I live in Tokyo"),
32
+ ]
33
+ document_store.write_documents(documents)
34
+
35
+ te = SentenceTransformersTextEmbedder()
36
+ te.warm_up()
37
+ query_embeddings = te.run("Who lives in Berlin?")["embedding"]
38
+
39
+ result = retriever.run(query_embedding=query_embeddings)
40
+ for doc in result["documents"]:
41
+ print(doc.content)
42
+ ```
17
43
  """
18
44
 
19
45
  def __init__(
@@ -28,13 +54,13 @@ class ElasticsearchEmbeddingRetriever:
28
54
  Create the ElasticsearchEmbeddingRetriever component.
29
55
 
30
56
  :param document_store: An instance of ElasticsearchDocumentStore.
31
- :param filters: Filters applied to the retrieved Documents. Defaults to None.
32
- Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
33
- :param top_k: Maximum number of Documents to return, defaults to 10
57
+ :param filters: Filters applied to the retrieved Documents.
58
+ Filters are applied during the approximate KNN search to ensure that top_k matching documents are returned.
59
+ :param top_k: Maximum number of Documents to return.
34
60
  :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
35
61
  Increasing this value will improve search accuracy at the cost of slower search speeds.
36
- You can read more about it in the Elasticsearch documentation:
37
- https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
62
+ You can read more about it in the Elasticsearch
63
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
38
64
  :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
39
65
  """
40
66
  if not isinstance(document_store, ElasticsearchDocumentStore):
@@ -47,6 +73,12 @@ class ElasticsearchEmbeddingRetriever:
47
73
  self._num_candidates = num_candidates
48
74
 
49
75
  def to_dict(self) -> Dict[str, Any]:
76
+ """
77
+ Serializes the component to a dictionary.
78
+
79
+ :returns:
80
+ Dictionary with serialized data.
81
+ """
50
82
  return default_to_dict(
51
83
  self,
52
84
  filters=self._filters,
@@ -57,6 +89,14 @@ class ElasticsearchEmbeddingRetriever:
57
89
 
58
90
  @classmethod
59
91
  def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
92
+ """
93
+ Deserializes the component from a dictionary.
94
+
95
+ :param data:
96
+ Dictionary to deserialize from.
97
+ :returns:
98
+ Deserialized component.
99
+ """
60
100
  data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
61
101
  data["init_parameters"]["document_store"]
62
102
  )
@@ -68,9 +108,10 @@ class ElasticsearchEmbeddingRetriever:
68
108
  Retrieve documents using a vector similarity metric.
69
109
 
70
110
  :param query_embedding: Embedding of the query.
71
- :param filters: Filters applied to the retrieved Documents.
72
- :param top_k: Maximum number of Documents to return.
73
- :return: List of Documents similar to `query_embedding`.
111
+ :param filters: Filters applied to the retrieved `Document`s.
112
+ :param top_k: Maximum number of `Document`s to return.
113
+ :returns: A dictionary with the following keys:
114
+ - `documents`: List of `Document`s most similar to the given `query_embedding`
74
115
  """
75
116
  docs = self._document_store._embedding_retrieval(
76
117
  query_embedding=query_embedding,
@@ -35,16 +35,16 @@ BM25_SCALING_FACTOR = 8
35
35
 
36
36
  class ElasticsearchDocumentStore:
37
37
  """
38
- ElasticsearchDocumentStore is a Document Store for Elasticsearch.
39
- It can be used with Elastic Cloud or your own Elasticsearch cluster.
38
+ ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
39
+ Elasticsearch cluster.
40
40
 
41
- Simple usage with Elastic Cloud:
41
+ Usage example (Elastic Cloud):
42
42
  ```python
43
43
  from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
44
44
  document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY")
45
45
  ```
46
46
 
47
- One can also connect to a self-hosted Elasticsearch instance:
47
+ Usage example (self-hosted Elasticsearch instance):
48
48
  ```python
49
49
  from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
50
50
  document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
@@ -53,8 +53,8 @@ class ElasticsearchDocumentStore:
53
53
  We strongly recommend enabling security so that only authorized users can access your data.
54
54
 
55
55
  For more details on how to connect to Elasticsearch and configure security,
56
- see the official Elasticsearch documentation:
57
- https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
56
+ see the official Elasticsearch
57
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
58
58
 
59
59
  All extra keyword arguments will be passed to the Elasticsearch client.
60
60
  """
@@ -69,26 +69,26 @@ class ElasticsearchDocumentStore:
69
69
  ):
70
70
  """
71
71
  Creates a new ElasticsearchDocumentStore instance.
72
- When no index is explicitly specified, it will use the default index "default".
73
- It will also try to create that index if it doesn't exist yet. Otherwise it will use the existing one.
72
+
73
+ It will also try to create that index if it doesn't exist yet. Otherwise, it will use the existing one.
74
74
 
75
75
  One can also set the similarity function used to compare Documents embeddings. This is mostly useful
76
76
  when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
77
77
 
78
- For more information on connection parameters, see the official Elasticsearch documentation:
79
- https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
78
+ For more information on connection parameters, see the official Elasticsearch
79
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
80
80
 
81
- For the full list of supported kwargs, see the official Elasticsearch reference:
82
- https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch
81
+ For the full list of supported kwargs, see the official Elasticsearch
82
+ [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
83
83
 
84
- :param hosts: List of hosts running the Elasticsearch client. Defaults to None
85
- :param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default"
84
+ :param hosts: List of hosts running the Elasticsearch client.
85
+ :param index: Name of index in Elasticsearch.
86
86
  :param embedding_similarity_function: The similarity function used to compare Documents embeddings.
87
- Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created.
87
+ This parameter only takes effect if the index does not yet exist and is created.
88
88
  To choose the most appropriate function, look for information about your embedding model.
89
- To understand how document scores are computed, see the Elasticsearch documentation:
90
- https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
91
- :param **kwargs: Optional arguments that ``Elasticsearch`` takes.
89
+ To understand how document scores are computed, see the Elasticsearch
90
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
91
+ :param **kwargs: Optional arguments that `Elasticsearch` takes.
92
92
  """
93
93
  self._hosts = hosts
94
94
  self._client = Elasticsearch(
@@ -106,8 +106,20 @@ class ElasticsearchDocumentStore:
106
106
  # configure mapping for the embedding field
107
107
  mappings = {
108
108
  "properties": {
109
- "embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function}
110
- }
109
+ "embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function},
110
+ "content": {"type": "text"},
111
+ },
112
+ "dynamic_templates": [
113
+ {
114
+ "strings": {
115
+ "path_match": "*",
116
+ "match_mapping_type": "string",
117
+ "mapping": {
118
+ "type": "keyword",
119
+ },
120
+ }
121
+ }
122
+ ],
111
123
  }
112
124
 
113
125
  # Create the index if it doesn't exist
@@ -115,6 +127,12 @@ class ElasticsearchDocumentStore:
115
127
  self._client.indices.create(index=index, mappings=mappings)
116
128
 
117
129
  def to_dict(self) -> Dict[str, Any]:
130
+ """
131
+ Serializes the component to a dictionary.
132
+
133
+ :returns:
134
+ Dictionary with serialized data.
135
+ """
118
136
  # This is not the best solution to serialise this class but is the fastest to implement.
119
137
  # Not all kwargs types can be serialised to text so this can fail. We must serialise each
120
138
  # type explicitly to handle this properly.
@@ -128,11 +146,20 @@ class ElasticsearchDocumentStore:
128
146
 
129
147
  @classmethod
130
148
  def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore":
149
+ """
150
+ Deserializes the component from a dictionary.
151
+
152
+ :param data:
153
+ Dictionary to deserialize from.
154
+ :returns:
155
+ Deserialized component.
156
+ """
131
157
  return default_from_dict(cls, data)
132
158
 
133
159
  def count_documents(self) -> int:
134
160
  """
135
161
  Returns how many documents are present in the document store.
162
+ :returns: Number of documents in the document store.
136
163
  """
137
164
  return self._client.count(index=self._index)["count"]
138
165
 
@@ -165,6 +192,14 @@ class ElasticsearchDocumentStore:
165
192
  return documents
166
193
 
167
194
  def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
195
+ """
196
+ The main query method for the document store. It retrieves all documents that match the filters.
197
+
198
+ :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
199
+ see the official Elasticsearch
200
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
201
+ :returns: List of `Document`s that match the filters.
202
+ """
168
203
  if filters and "operator" not in filters and "conditions" not in filters:
169
204
  filters = convert(filters)
170
205
 
@@ -174,9 +209,15 @@ class ElasticsearchDocumentStore:
174
209
 
175
210
  def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
176
211
  """
177
- Writes Documents to Elasticsearch.
178
- If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the
179
- same ID already exists in the document store.
212
+ Writes `Document`s to Elasticsearch.
213
+
214
+ :param documents: List of Documents to write to the document store.
215
+ :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
216
+ :raises ValueError: If `documents` is not a list of `Document`s.
217
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
218
+ `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
219
+ :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
220
+ :returns: Number of documents written to the document store.
180
221
  """
181
222
  if len(documents) > 0:
182
223
  if not isinstance(documents[0], Document):
@@ -187,16 +228,30 @@ class ElasticsearchDocumentStore:
187
228
  policy = DuplicatePolicy.FAIL
188
229
 
189
230
  action = "index" if policy == DuplicatePolicy.OVERWRITE else "create"
190
- documents_written, errors = helpers.bulk(
191
- client=self._client,
192
- actions=(
231
+
232
+ elasticsearch_actions = []
233
+ for doc in documents:
234
+ doc_dict = doc.to_dict()
235
+ if "sparse_embedding" in doc_dict:
236
+ sparse_embedding = doc_dict.pop("sparse_embedding", None)
237
+ if sparse_embedding:
238
+ logger.warning(
239
+ "Document %s has the `sparse_embedding` field set,"
240
+ "but storing sparse embeddings in Elasticsearch is not currently supported."
241
+ "The `sparse_embedding` field will be ignored.",
242
+ doc.id,
243
+ )
244
+ elasticsearch_actions.append(
193
245
  {
194
246
  "_op_type": action,
195
247
  "_id": doc.id,
196
- "_source": doc.to_dict(),
248
+ "_source": doc_dict,
197
249
  }
198
- for doc in documents
199
- ),
250
+ )
251
+
252
+ documents_written, errors = helpers.bulk(
253
+ client=self._client,
254
+ actions=elasticsearch_actions,
200
255
  refresh="wait_for",
201
256
  index=self._index,
202
257
  raise_on_error=False,
@@ -225,10 +280,15 @@ class ElasticsearchDocumentStore:
225
280
 
226
281
  return documents_written
227
282
 
228
- def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
283
+ @staticmethod
284
+ def _deserialize_document(hit: Dict[str, Any]) -> Document:
229
285
  """
230
- Creates a Document from the search hit provided.
286
+ Creates a `Document` from the search hit provided.
287
+
231
288
  This is mostly useful in self.filter_documents().
289
+
290
+ :param hit: A search hit from Elasticsearch.
291
+ :returns: `Document` created from the search hit.
232
292
  """
233
293
  data = hit["_source"]
234
294
 
@@ -240,12 +300,11 @@ class ElasticsearchDocumentStore:
240
300
 
241
301
  def delete_documents(self, document_ids: List[str]) -> None:
242
302
  """
243
- Deletes all documents with a matching document_ids from the document store.
303
+ Deletes all `Document`s with a matching `document_ids` from the document store.
244
304
 
245
- :param object_ids: the object_ids to delete
305
+ :param document_ids: the object IDs to delete
246
306
  """
247
307
 
248
- #
249
308
  helpers.bulk(
250
309
  client=self._client,
251
310
  actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
@@ -264,26 +323,25 @@ class ElasticsearchDocumentStore:
264
323
  scale_score: bool = False,
265
324
  ) -> List[Document]:
266
325
  """
267
- Elasticsearch by defaults uses BM25 search algorithm.
326
+ Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
327
+
268
328
  Even though this method is called `bm25_retrieval` it searches for `query`
269
329
  using the search algorithm `_client` was configured with.
270
330
 
271
- This method is not mean to be part of the public interface of
331
+ This method is not meant to be part of the public interface of
272
332
  `ElasticsearchDocumentStore` nor called directly.
273
333
  `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
274
334
 
275
- `query` must be a non empty string, otherwise a `ValueError` will be raised.
276
-
277
- :param query: String to search in saved Documents' text.
278
- :param filters: Filters applied to the retrieved Documents, for more info
279
- see `ElasticsearchDocumentStore.filter_documents`, defaults to None
280
- :param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
281
- see the official documentation for valid values:
282
- https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
283
- :param top_k: Maximum number of Documents to return, defaults to 10
284
- :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
335
+ :param query: String to search in saved `Document`s' text.
336
+ :param filters: Filters applied to the retrieved `Document`s, for more info
337
+ see `ElasticsearchDocumentStore.filter_documents`.
338
+ :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
339
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
340
+ for valid values.
341
+ :param top_k: Maximum number of `Document`s to return.
342
+ :param scale_score: If `True`, scales the `Document`s' scores between 0 and 1.
285
343
  :raises ValueError: If `query` is an empty string
286
- :return: List of Document that match `query`
344
+ :returns: List of `Document`s that match `query`.
287
345
  """
288
346
 
289
347
  if not query:
@@ -329,22 +387,23 @@ class ElasticsearchDocumentStore:
329
387
  ) -> List[Document]:
330
388
  """
331
389
  Retrieves documents that are most similar to the query embedding using a vector similarity metric.
390
+
332
391
  It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
333
392
 
334
- This method is not mean to be part of the public interface of
393
+ This method is not meant to be part of the public interface of
335
394
  `ElasticsearchDocumentStore` nor called directly.
336
395
  `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
337
396
 
338
397
  :param query_embedding: Embedding of the query.
339
- :param filters: Filters applied to the retrieved Documents. Defaults to None.
398
+ :param filters: Filters applied to the retrieved `Document`s.
340
399
  Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
341
- :param top_k: Maximum number of Documents to return, defaults to 10
400
+ :param top_k: Maximum number of `Document`s to return.
342
401
  :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
343
402
  Increasing this value will improve search accuracy at the cost of slower search speeds.
344
- You can read more about it in the Elasticsearch documentation:
345
- https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
346
- :raises ValueError: If `query_embedding` is an empty list
347
- :return: List of Document that are most similar to `query_embedding`
403
+ You can read more about it in the Elasticsearch
404
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
405
+ :raises ValueError: If `query_embedding` is an empty list.
406
+ :returns: List of `Document`s that are most similar to `query_embedding`.
348
407
  """
349
408
 
350
409
  if not query_embedding:
@@ -15,6 +15,36 @@ from haystack.testing.document_store import DocumentStoreBaseTests
15
15
  from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
16
16
 
17
17
 
18
+ @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
19
+ def test_to_dict(_mock_elasticsearch_client):
20
+ document_store = ElasticsearchDocumentStore(hosts="some hosts")
21
+ res = document_store.to_dict()
22
+ assert res == {
23
+ "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
24
+ "init_parameters": {
25
+ "hosts": "some hosts",
26
+ "index": "default",
27
+ "embedding_similarity_function": "cosine",
28
+ },
29
+ }
30
+
31
+
32
+ @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
33
+ def test_from_dict(_mock_elasticsearch_client):
34
+ data = {
35
+ "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
36
+ "init_parameters": {
37
+ "hosts": "some hosts",
38
+ "index": "default",
39
+ "embedding_similarity_function": "cosine",
40
+ },
41
+ }
42
+ document_store = ElasticsearchDocumentStore.from_dict(data)
43
+ assert document_store._hosts == "some hosts"
44
+ assert document_store._index == "default"
45
+ assert document_store._embedding_similarity_function == "cosine"
46
+
47
+
18
48
  @pytest.mark.integration
19
49
  class TestDocumentStore(DocumentStoreBaseTests):
20
50
  """
@@ -67,34 +97,6 @@ class TestDocumentStore(DocumentStoreBaseTests):
67
97
 
68
98
  super().assert_documents_are_equal(received, expected)
69
99
 
70
- @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
71
- def test_to_dict(self, _mock_elasticsearch_client):
72
- document_store = ElasticsearchDocumentStore(hosts="some hosts")
73
- res = document_store.to_dict()
74
- assert res == {
75
- "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
76
- "init_parameters": {
77
- "hosts": "some hosts",
78
- "index": "default",
79
- "embedding_similarity_function": "cosine",
80
- },
81
- }
82
-
83
- @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
84
- def test_from_dict(self, _mock_elasticsearch_client):
85
- data = {
86
- "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
87
- "init_parameters": {
88
- "hosts": "some hosts",
89
- "index": "default",
90
- "embedding_similarity_function": "cosine",
91
- },
92
- }
93
- document_store = ElasticsearchDocumentStore.from_dict(data)
94
- assert document_store._hosts == "some hosts"
95
- assert document_store._index == "default"
96
- assert document_store._embedding_similarity_function == "cosine"
97
-
98
100
  def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
99
101
  assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
100
102