elasticsearch-haystack 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- elasticsearch_haystack-5.1.0.dist-info/METADATA +41 -0
- elasticsearch_haystack-5.1.0.dist-info/RECORD +12 -0
- elasticsearch_haystack-5.1.0.dist-info/WHEEL +4 -0
- elasticsearch_haystack-5.1.0.dist-info/licenses/LICENSE +201 -0
- haystack_integrations/components/retrievers/elasticsearch/__init__.py +7 -0
- haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +166 -0
- haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +164 -0
- haystack_integrations/components/retrievers/py.typed +0 -0
- haystack_integrations/document_stores/elasticsearch/__init__.py +6 -0
- haystack_integrations/document_stores/elasticsearch/document_store.py +1477 -0
- haystack_integrations/document_stores/elasticsearch/filters.py +246 -0
- haystack_integrations/document_stores/py.typed +0 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from haystack.errors import FilterError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _normalize_filters(filters: dict[str, Any]) -> dict[str, Any]:
    """
    Translate a Haystack filter dictionary into an Elasticsearch query clause.

    A dictionary carrying a "field" key is treated as a single comparison and
    wrapped in a `bool`/`must` clause; anything else is handed to the logical
    condition parser.

    :param filters: The Haystack filter to convert.
    :returns: The equivalent Elasticsearch query dictionary.
    :raises FilterError: If `filters` is not a dictionary, or if the nested
        parsers reject its content.
    """
    if not isinstance(filters, dict):
        raise FilterError("Filters must be a dictionary")

    if "field" not in filters:
        # No "field" key: this has to be a logical (AND/OR/NOT) condition.
        return _parse_logical_condition(filters)

    comparison = _parse_comparison_condition(filters)
    return {"bool": {"must": comparison}}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _parse_logical_condition(condition: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a Haystack logical condition into an Elasticsearch `bool` query.

    Every entry of `condition["conditions"]` is parsed first (nested logical
    conditions are handled by the comparison parser, which delegates back
    here), then the results are combined according to `condition["operator"]`:

    - "AND" -> `{"bool": {"must": [...]}}`
    - "OR"  -> `{"bool": {"should": [...]}}`
    - "NOT" -> `{"bool": {"must_not": [{"bool": {"must": [...]}}]}}`

    :param condition: Dictionary with the keys "operator" and "conditions".
    :returns: The Elasticsearch `bool` query.
    :raises FilterError: If "operator" or "conditions" is missing, or if the
        operator is not one of "AND", "OR", "NOT".
    """
    for required_key in ("operator", "conditions"):
        if required_key not in condition:
            msg = f"'{required_key}' key missing in {condition}"
            raise FilterError(msg)

    operator = condition["operator"]
    conditions = [_parse_comparison_condition(c) for c in condition["conditions"]]

    # Merging several range clauses of one field into a single clause ANDs
    # their bounds together. That is only equivalent when the clauses are
    # combined with "must" (AND, and the inner "must" of NOT). Under OR it
    # would turn e.g. `x < 1 OR x > 5` into the unsatisfiable `1 > x > 5`,
    # so the clauses are left untouched in that case.
    if operator in ("AND", "NOT") and len(conditions) > 1:
        conditions = _normalize_ranges(conditions)

    if operator == "AND":
        return {"bool": {"must": conditions}}
    if operator == "OR":
        return {"bool": {"should": conditions}}
    if operator == "NOT":
        return {"bool": {"must_not": [{"bool": {"must": conditions}}]}}

    msg = f"Unknown logical operator '{operator}'"
    raise FilterError(msg)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _equal(field: str, value: Any) -> dict[str, Any]:
|
|
47
|
+
if value is None:
|
|
48
|
+
return {"bool": {"must_not": {"exists": {"field": field}}}}
|
|
49
|
+
|
|
50
|
+
if isinstance(value, list):
|
|
51
|
+
return {
|
|
52
|
+
"terms_set": {
|
|
53
|
+
field: {
|
|
54
|
+
"terms": value,
|
|
55
|
+
"minimum_should_match_script": {"source": f"Math.max(params.num_terms, doc['{field}'].size())"},
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
if field == "text":
|
|
60
|
+
# We want to fully match the text field.
|
|
61
|
+
return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
|
|
62
|
+
return {"term": {field: value}}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _not_equal(field: str, value: Any) -> dict[str, Any]:
|
|
66
|
+
if value is None:
|
|
67
|
+
return {"exists": {"field": field}}
|
|
68
|
+
|
|
69
|
+
if isinstance(value, list):
|
|
70
|
+
return {"bool": {"must_not": {"terms": {field: value}}}}
|
|
71
|
+
if field == "text":
|
|
72
|
+
# We want to fully match the text field.
|
|
73
|
+
return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
|
|
74
|
+
|
|
75
|
+
return {"bool": {"must_not": {"term": {field: value}}}}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _greater_than(field: str, value: Any) -> dict[str, Any]:
|
|
79
|
+
if value is None:
|
|
80
|
+
# When the value is None and '>' is used we create a filter that would return a Document
|
|
81
|
+
# if it has a field set and not set at the same time.
|
|
82
|
+
# This will cause the filter to match no Document.
|
|
83
|
+
# This way we keep the behavior consistent with other Document Stores.
|
|
84
|
+
return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}}
|
|
85
|
+
if isinstance(value, str):
|
|
86
|
+
try:
|
|
87
|
+
datetime.fromisoformat(value)
|
|
88
|
+
except (ValueError, TypeError) as exc:
|
|
89
|
+
msg = (
|
|
90
|
+
"Can't compare strings using operators '>', '>=', '<', '<='. "
|
|
91
|
+
"Strings are only comparable if they are ISO formatted dates."
|
|
92
|
+
)
|
|
93
|
+
raise FilterError(msg) from exc
|
|
94
|
+
if isinstance(value, list):
|
|
95
|
+
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
96
|
+
raise FilterError(msg)
|
|
97
|
+
return {"range": {field: {"gt": value}}}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _greater_than_equal(field: str, value: Any) -> dict[str, Any]:
|
|
101
|
+
if value is None:
|
|
102
|
+
# When the value is None and '>=' is used we create a filter that would return a Document
|
|
103
|
+
# if it has a field set and not set at the same time.
|
|
104
|
+
# This will cause the filter to match no Document.
|
|
105
|
+
# This way we keep the behavior consistent with other Document Stores.
|
|
106
|
+
return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}}
|
|
107
|
+
if isinstance(value, str):
|
|
108
|
+
try:
|
|
109
|
+
datetime.fromisoformat(value)
|
|
110
|
+
except (ValueError, TypeError) as exc:
|
|
111
|
+
msg = (
|
|
112
|
+
"Can't compare strings using operators '>', '>=', '<', '<='. "
|
|
113
|
+
"Strings are only comparable if they are ISO formatted dates."
|
|
114
|
+
)
|
|
115
|
+
raise FilterError(msg) from exc
|
|
116
|
+
if isinstance(value, list):
|
|
117
|
+
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
118
|
+
raise FilterError(msg)
|
|
119
|
+
return {"range": {field: {"gte": value}}}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _less_than(field: str, value: Any) -> dict[str, Any]:
|
|
123
|
+
if value is None:
|
|
124
|
+
# When the value is None and '<' is used we create a filter that would return a Document
|
|
125
|
+
# if it has a field set and not set at the same time.
|
|
126
|
+
# This will cause the filter to match no Document.
|
|
127
|
+
# This way we keep the behavior consistent with other Document Stores.
|
|
128
|
+
return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}}
|
|
129
|
+
if isinstance(value, str):
|
|
130
|
+
try:
|
|
131
|
+
datetime.fromisoformat(value)
|
|
132
|
+
except (ValueError, TypeError) as exc:
|
|
133
|
+
msg = (
|
|
134
|
+
"Can't compare strings using operators '>', '>=', '<', '<='. "
|
|
135
|
+
"Strings are only comparable if they are ISO formatted dates."
|
|
136
|
+
)
|
|
137
|
+
raise FilterError(msg) from exc
|
|
138
|
+
if isinstance(value, list):
|
|
139
|
+
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
140
|
+
raise FilterError(msg)
|
|
141
|
+
return {"range": {field: {"lt": value}}}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _less_than_equal(field: str, value: Any) -> dict[str, Any]:
|
|
145
|
+
if value is None:
|
|
146
|
+
# When the value is None and '<=' is used we create a filter that would return a Document
|
|
147
|
+
# if it has a field set and not set at the same time.
|
|
148
|
+
# This will cause the filter to match no Document.
|
|
149
|
+
# This way we keep the behavior consistent with other Document Stores.
|
|
150
|
+
return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}}
|
|
151
|
+
if isinstance(value, str):
|
|
152
|
+
try:
|
|
153
|
+
datetime.fromisoformat(value)
|
|
154
|
+
except (ValueError, TypeError) as exc:
|
|
155
|
+
msg = (
|
|
156
|
+
"Can't compare strings using operators '>', '>=', '<', '<='. "
|
|
157
|
+
"Strings are only comparable if they are ISO formatted dates."
|
|
158
|
+
)
|
|
159
|
+
raise FilterError(msg) from exc
|
|
160
|
+
if isinstance(value, list):
|
|
161
|
+
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
162
|
+
raise FilterError(msg)
|
|
163
|
+
return {"range": {field: {"lte": value}}}
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _in(field: str, value: Any) -> dict[str, Any]:
|
|
167
|
+
if not isinstance(value, list):
|
|
168
|
+
msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators"
|
|
169
|
+
raise FilterError(msg)
|
|
170
|
+
return {"terms": {field: value}}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _not_in(field: str, value: Any) -> dict[str, Any]:
|
|
174
|
+
if not isinstance(value, list):
|
|
175
|
+
msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators"
|
|
176
|
+
raise FilterError(msg)
|
|
177
|
+
return {"bool": {"must_not": {"terms": {field: value}}}}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# Dispatch table mapping each Haystack comparison operator to the function
# that builds the corresponding Elasticsearch clause. Every function is
# called as `builder(field, value)` and returns a query dictionary.
COMPARISON_OPERATORS = {
    "==": _equal,
    "!=": _not_equal,
    ">": _greater_than,
    ">=": _greater_than_equal,
    "<": _less_than,
    "<=": _less_than_equal,
    "in": _in,
    "not in": _not_in,
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _parse_comparison_condition(condition: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a single Haystack comparison condition into an Elasticsearch clause.

    A dictionary without a "field" key is assumed to be a nested logical
    condition and is delegated to `_parse_logical_condition`.

    :param condition: Dictionary with the keys "field", "operator" and "value".
    :returns: The Elasticsearch clause built by the function registered for
        the operator in `COMPARISON_OPERATORS`.
    :raises FilterError: If "operator" or "value" is missing, if the operator
        is not a known comparison operator, or if the operator's builder
        rejects the value.
    """
    if "field" not in condition:
        # 'field' key is only found in comparison dictionaries.
        # We assume this is a logic dictionary since it's not present.
        return _parse_logical_condition(condition)
    field: str = condition["field"]

    # Documents are flattened when using the ElasticSearchDocumentStore, so
    # the "meta." prefix is not needed. Instead of raising an error it is
    # stripped gracefully when present.
    field = field.removeprefix("meta.")

    if "operator" not in condition:
        msg = f"'operator' key missing in {condition}"
        raise FilterError(msg)
    if "value" not in condition:
        msg = f"'value' key missing in {condition}"
        raise FilterError(msg)
    operator: str = condition["operator"]
    value: Any = condition["value"]

    # Only the table lookup is guarded, so errors raised by the builder
    # itself are never mistaken for an unknown operator.
    try:
        build_clause = COMPARISON_OPERATORS[operator]
    except (KeyError, TypeError) as exc:
        # Report unknown (or unhashable) operators as a FilterError, the same
        # way unknown logical operators are reported, instead of leaking a
        # bare KeyError/TypeError from the dispatch table.
        msg = f"Unknown comparison operator '{operator}'"
        raise FilterError(msg) from exc

    return build_clause(field, value)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _normalize_ranges(conditions: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
219
|
+
"""
|
|
220
|
+
Merges range conditions acting on a same field.
|
|
221
|
+
|
|
222
|
+
Example usage:
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
conditions = [
|
|
226
|
+
{"range": {"date": {"lt": "2021-01-01"}}},
|
|
227
|
+
{"range": {"date": {"gte": "2015-01-01"}}},
|
|
228
|
+
]
|
|
229
|
+
conditions = _normalize_ranges(conditions)
|
|
230
|
+
assert conditions == [
|
|
231
|
+
{"range": {"date": {"lt": "2021-01-01", "gte": "2015-01-01"}}},
|
|
232
|
+
]
|
|
233
|
+
```
|
|
234
|
+
"""
|
|
235
|
+
range_conditions = [next(iter(c["range"].items())) for c in conditions if "range" in c]
|
|
236
|
+
if range_conditions:
|
|
237
|
+
conditions = [c for c in conditions if "range" not in c]
|
|
238
|
+
range_conditions_dict: dict[str, Any] = {}
|
|
239
|
+
for field_name, comparison in range_conditions:
|
|
240
|
+
if field_name not in range_conditions_dict:
|
|
241
|
+
range_conditions_dict[field_name] = {}
|
|
242
|
+
range_conditions_dict[field_name].update(comparison)
|
|
243
|
+
|
|
244
|
+
for field_name, comparisons in range_conditions_dict.items():
|
|
245
|
+
conditions.append({"range": {field_name: comparisons}})
|
|
246
|
+
return conditions
|
|
File without changes
|