nucliadb 6.3.7.post4091__py3-none-any.whl → 6.3.7.post4116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/ingest/consumer/consumer.py +3 -4
- nucliadb/search/api/v1/find.py +5 -5
- nucliadb/search/api/v1/search.py +2 -10
- nucliadb/search/search/chat/ask.py +6 -3
- nucliadb/search/search/chat/query.py +21 -17
- nucliadb/search/search/find.py +14 -5
- nucliadb/search/search/find_merge.py +27 -13
- nucliadb/search/search/merge.py +17 -18
- nucliadb/search/search/query_parser/models.py +22 -27
- nucliadb/search/search/query_parser/parsers/common.py +32 -21
- nucliadb/search/search/query_parser/parsers/find.py +31 -8
- nucliadb/search/search/query_parser/parsers/search.py +33 -10
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +207 -115
- nucliadb/search/search/utils.py +2 -42
- {nucliadb-6.3.7.post4091.dist-info → nucliadb-6.3.7.post4116.dist-info}/METADATA +6 -6
- {nucliadb-6.3.7.post4091.dist-info → nucliadb-6.3.7.post4116.dist-info}/RECORD +19 -19
- {nucliadb-6.3.7.post4091.dist-info → nucliadb-6.3.7.post4116.dist-info}/WHEEL +1 -1
- {nucliadb-6.3.7.post4091.dist-info → nucliadb-6.3.7.post4116.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.7.post4091.dist-info → nucliadb-6.3.7.post4116.dist-info}/top_level.txt +0 -0
@@ -53,7 +53,8 @@ from .common import (
|
|
53
53
|
parse_keyword_query,
|
54
54
|
parse_semantic_query,
|
55
55
|
parse_top_k,
|
56
|
-
|
56
|
+
should_disable_vector_search,
|
57
|
+
validate_query_syntax,
|
57
58
|
)
|
58
59
|
|
59
60
|
|
@@ -93,7 +94,7 @@ class _FindParser:
|
|
93
94
|
self._top_k: Optional[int] = None
|
94
95
|
|
95
96
|
async def parse(self) -> UnitRetrieval:
|
96
|
-
|
97
|
+
self._validate_request()
|
97
98
|
|
98
99
|
self._top_k = parse_top_k(self.item)
|
99
100
|
|
@@ -101,13 +102,13 @@ class _FindParser:
|
|
101
102
|
|
102
103
|
self._query = Query()
|
103
104
|
|
104
|
-
if search_models.
|
105
|
+
if search_models.FindOptions.KEYWORD in self.item.features:
|
105
106
|
self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
|
106
107
|
|
107
|
-
if search_models.
|
108
|
+
if search_models.FindOptions.SEMANTIC in self.item.features:
|
108
109
|
self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)
|
109
110
|
|
110
|
-
if search_models.
|
111
|
+
if search_models.FindOptions.RELATIONS in self.item.features:
|
111
112
|
self._query.relation = await self._parse_relation_query()
|
112
113
|
|
113
114
|
# TODO: graph search
|
@@ -130,13 +131,35 @@ class _FindParser:
|
|
130
131
|
if isinstance(reranker, PredictReranker):
|
131
132
|
rank_fusion.window = max(rank_fusion.window, reranker.window)
|
132
133
|
|
133
|
-
|
134
|
+
retrieval = UnitRetrieval(
|
134
135
|
query=self._query,
|
135
136
|
top_k=self._top_k,
|
136
137
|
filters=filters,
|
137
138
|
rank_fusion=rank_fusion,
|
138
139
|
reranker=reranker,
|
139
140
|
)
|
141
|
+
return retrieval
|
142
|
+
|
143
|
+
def _validate_request(self):
|
144
|
+
validate_query_syntax(self.item.query)
|
145
|
+
|
146
|
+
# synonyms are not compatible with vector/graph search
|
147
|
+
if (
|
148
|
+
self.item.with_synonyms
|
149
|
+
and self.item.query
|
150
|
+
and (
|
151
|
+
search_models.FindOptions.SEMANTIC in self.item.features
|
152
|
+
or search_models.FindOptions.RELATIONS in self.item.features
|
153
|
+
)
|
154
|
+
):
|
155
|
+
raise InvalidQueryError(
|
156
|
+
"synonyms",
|
157
|
+
"Search with custom synonyms is only supported on paragraph and document search",
|
158
|
+
)
|
159
|
+
|
160
|
+
if search_models.FindOptions.SEMANTIC in self.item.features:
|
161
|
+
if should_disable_vector_search(self.item):
|
162
|
+
self.item.features.remove(search_models.FindOptions.SEMANTIC)
|
140
163
|
|
141
164
|
async def _parse_relation_query(self) -> RelationQuery:
|
142
165
|
detected_entities = await self._get_detected_entities()
|
@@ -147,7 +170,7 @@ class _FindParser:
|
|
147
170
|
deleted_entities = meta_cache.deleted_entities
|
148
171
|
|
149
172
|
return RelationQuery(
|
150
|
-
|
173
|
+
entry_points=detected_entities,
|
151
174
|
deleted_entity_groups=deleted_entity_groups,
|
152
175
|
deleted_entities=deleted_entities,
|
153
176
|
)
|
@@ -220,7 +243,7 @@ class _FindParser:
|
|
220
243
|
autofilter = None
|
221
244
|
if self.item.autofilter:
|
222
245
|
if self._query.relation is not None:
|
223
|
-
autofilter = self._query.relation.
|
246
|
+
autofilter = self._query.relation.entry_points
|
224
247
|
else:
|
225
248
|
autofilter = await self._get_detected_entities()
|
226
249
|
|
@@ -26,10 +26,8 @@ from nucliadb.search.search.query_parser.fetcher import Fetcher
|
|
26
26
|
from nucliadb.search.search.query_parser.filter_expression import parse_expression
|
27
27
|
from nucliadb.search.search.query_parser.models import (
|
28
28
|
Filters,
|
29
|
-
NoopReranker,
|
30
29
|
ParsedQuery,
|
31
30
|
Query,
|
32
|
-
RankFusion,
|
33
31
|
RelationQuery,
|
34
32
|
UnitRetrieval,
|
35
33
|
_TextQuery,
|
@@ -46,7 +44,13 @@ from nucliadb_models.search import (
|
|
46
44
|
)
|
47
45
|
from nucliadb_protos import nodereader_pb2, utils_pb2
|
48
46
|
|
49
|
-
from .common import
|
47
|
+
from .common import (
|
48
|
+
parse_keyword_query,
|
49
|
+
parse_semantic_query,
|
50
|
+
parse_top_k,
|
51
|
+
should_disable_vector_search,
|
52
|
+
validate_query_syntax,
|
53
|
+
)
|
50
54
|
|
51
55
|
INDEX_SORTABLE_FIELDS = [
|
52
56
|
SortField.CREATED,
|
@@ -87,7 +91,7 @@ class _SearchParser:
|
|
87
91
|
self._top_k: Optional[int] = None
|
88
92
|
|
89
93
|
async def parse(self) -> UnitRetrieval:
|
90
|
-
|
94
|
+
self._validate_request()
|
91
95
|
|
92
96
|
self._top_k = parse_top_k(self.item)
|
93
97
|
|
@@ -113,14 +117,33 @@ class _SearchParser:
|
|
113
117
|
|
114
118
|
filters = await self._parse_filters()
|
115
119
|
|
116
|
-
|
120
|
+
retrieval = UnitRetrieval(
|
117
121
|
query=self._query,
|
118
122
|
top_k=self._top_k,
|
119
123
|
filters=filters,
|
120
|
-
# TODO: this should be in a post retrieval step
|
121
|
-
rank_fusion=RankFusion(window=self._top_k),
|
122
|
-
reranker=NoopReranker(),
|
123
124
|
)
|
125
|
+
return retrieval
|
126
|
+
|
127
|
+
def _validate_request(self):
|
128
|
+
validate_query_syntax(self.item.query)
|
129
|
+
|
130
|
+
# synonyms are not compatible with vector/graph search
|
131
|
+
if (
|
132
|
+
self.item.with_synonyms
|
133
|
+
and self.item.query
|
134
|
+
and (
|
135
|
+
search_models.SearchOptions.SEMANTIC in self.item.features
|
136
|
+
or search_models.SearchOptions.RELATIONS in self.item.features
|
137
|
+
)
|
138
|
+
):
|
139
|
+
raise InvalidQueryError(
|
140
|
+
"synonyms",
|
141
|
+
"Search with custom synonyms is only supported on paragraph and document search",
|
142
|
+
)
|
143
|
+
|
144
|
+
if search_models.SearchOptions.SEMANTIC in self.item.features:
|
145
|
+
if should_disable_vector_search(self.item):
|
146
|
+
self.item.features.remove(search_models.SearchOptions.SEMANTIC)
|
124
147
|
|
125
148
|
async def _parse_text_query(self) -> _TextQuery:
|
126
149
|
assert self._top_k is not None, "top_k must be parsed before text query"
|
@@ -140,7 +163,7 @@ class _SearchParser:
|
|
140
163
|
meta_cache = await self.fetcher.get_entities_meta_cache()
|
141
164
|
deleted_entities = meta_cache.deleted_entities
|
142
165
|
return RelationQuery(
|
143
|
-
|
166
|
+
entry_points=detected_entities,
|
144
167
|
deleted_entity_groups=deleted_entity_groups,
|
145
168
|
deleted_entities=deleted_entities,
|
146
169
|
)
|
@@ -231,7 +254,7 @@ class _SearchParser:
|
|
231
254
|
autofilter = None
|
232
255
|
if self.item.autofilter:
|
233
256
|
if self._query.relation is not None:
|
234
|
-
autofilter = self._query.relation.
|
257
|
+
autofilter = self._query.relation.entry_points
|
235
258
|
else:
|
236
259
|
autofilter = await self._get_detected_entities()
|
237
260
|
|
@@ -19,154 +19,246 @@
|
|
19
19
|
#
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from nucliadb.search.search.filters import
|
23
|
-
|
24
|
-
|
25
|
-
from nucliadb.search.search.metrics import (
|
26
|
-
node_features,
|
27
|
-
query_parser_observer,
|
28
|
-
)
|
29
|
-
from nucliadb.search.search.query import (
|
30
|
-
apply_entities_filter,
|
31
|
-
get_sort_field_proto,
|
32
|
-
)
|
22
|
+
from nucliadb.search.search.filters import translate_label
|
23
|
+
from nucliadb.search.search.metrics import node_features, query_parser_observer
|
24
|
+
from nucliadb.search.search.query import apply_entities_filter, get_sort_field_proto
|
33
25
|
from nucliadb.search.search.query_parser.filter_expression import add_and_expression
|
34
26
|
from nucliadb.search.search.query_parser.models import ParsedQuery, PredictReranker, UnitRetrieval
|
35
27
|
from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
|
36
|
-
from nucliadb_models.search import
|
37
|
-
SortOrderMap,
|
38
|
-
)
|
28
|
+
from nucliadb_models.search import SortOrderMap
|
39
29
|
from nucliadb_protos import nodereader_pb2, utils_pb2
|
40
30
|
from nucliadb_protos.nodereader_pb2 import SearchRequest
|
41
31
|
|
42
32
|
|
43
33
|
@query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
|
44
|
-
async def
|
34
|
+
async def legacy_convert_retrieval_to_proto(
|
45
35
|
parsed: ParsedQuery,
|
46
36
|
) -> tuple[SearchRequest, bool, list[str], Optional[str]]:
|
47
|
-
|
37
|
+
converter = _Converter(parsed.retrieval)
|
38
|
+
request = converter.into_search_request()
|
48
39
|
|
49
|
-
|
40
|
+
# XXX: legacy values that were returned by QueryParser but not always
|
41
|
+
# needed. We should find a better abstraction
|
50
42
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
43
|
+
incomplete = is_incomplete(parsed.retrieval)
|
44
|
+
autofilter = converter._autofilter
|
45
|
+
|
46
|
+
rephrased_query = None
|
47
|
+
if parsed.retrieval.query.semantic:
|
48
|
+
rephrased_query = await parsed.fetcher.get_rephrased_query()
|
55
49
|
|
56
|
-
|
57
|
-
request.document = True
|
58
|
-
node_features.inc({"type": "documents"})
|
59
|
-
if parsed.retrieval.query.keyword:
|
60
|
-
request.paragraph = True
|
61
|
-
node_features.inc({"type": "paragraphs"})
|
50
|
+
return request, incomplete, autofilter, rephrased_query
|
62
51
|
|
63
|
-
|
64
|
-
|
65
|
-
|
52
|
+
|
53
|
+
@query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
|
54
|
+
def convert_retrieval_to_proto(retrieval: UnitRetrieval) -> SearchRequest:
|
55
|
+
converter = _Converter(retrieval)
|
56
|
+
request = converter.into_search_request()
|
57
|
+
return request
|
58
|
+
|
59
|
+
|
60
|
+
class _Converter:
|
61
|
+
def __init__(self, retrieval: UnitRetrieval):
|
62
|
+
self.req = nodereader_pb2.SearchRequest()
|
63
|
+
self.retrieval = retrieval
|
64
|
+
|
65
|
+
self._autofilter: list[str] = []
|
66
|
+
|
67
|
+
def into_search_request(self) -> nodereader_pb2.SearchRequest:
|
68
|
+
"""Generate a SearchRequest proto from a retrieval operation."""
|
69
|
+
self._apply_text_queries()
|
70
|
+
self._apply_semantic_query()
|
71
|
+
self._apply_relation_query()
|
72
|
+
self._apply_filters()
|
73
|
+
self._apply_top_k()
|
74
|
+
return self.req
|
75
|
+
|
76
|
+
def _apply_text_queries(self):
|
77
|
+
text_query = self.retrieval.query.keyword or self.retrieval.query.fulltext
|
78
|
+
if text_query is None:
|
79
|
+
return
|
80
|
+
|
81
|
+
if self.retrieval.query.keyword and self.retrieval.query.fulltext:
|
82
|
+
assert self.retrieval.query.keyword == self.retrieval.query.fulltext, (
|
83
|
+
"search proto doesn't support different queries for fulltext and keyword search"
|
84
|
+
)
|
85
|
+
|
86
|
+
if self.retrieval.query.fulltext:
|
87
|
+
self.req.document = True
|
88
|
+
node_features.inc({"type": "documents"})
|
89
|
+
if self.retrieval.query.keyword:
|
90
|
+
self.req.paragraph = True
|
91
|
+
node_features.inc({"type": "paragraphs"})
|
92
|
+
|
93
|
+
self.req.min_score_bm25 = text_query.min_score
|
66
94
|
|
67
95
|
if text_query.is_synonyms_query:
|
68
|
-
|
96
|
+
self.req.advanced_query = text_query.query
|
69
97
|
else:
|
70
|
-
|
98
|
+
self.req.body = text_query.query
|
71
99
|
|
72
100
|
# sort order
|
73
101
|
sort_field = get_sort_field_proto(text_query.order_by)
|
74
102
|
if sort_field is not None:
|
75
|
-
|
76
|
-
|
103
|
+
self.req.order.sort_by = sort_field
|
104
|
+
self.req.order.type = SortOrderMap[text_query.sort] # type: ignore
|
105
|
+
|
106
|
+
def _apply_semantic_query(self):
|
107
|
+
if self.retrieval.query.semantic is None:
|
108
|
+
return
|
77
109
|
|
78
|
-
if parsed.retrieval.query.semantic:
|
79
110
|
node_features.inc({"type": "vectors"})
|
80
111
|
|
81
|
-
|
112
|
+
self.req.min_score_semantic = self.retrieval.query.semantic.min_score
|
82
113
|
|
83
|
-
query_vector =
|
114
|
+
query_vector = self.retrieval.query.semantic.query
|
84
115
|
if query_vector is not None:
|
85
|
-
|
86
|
-
|
116
|
+
self.req.vectorset = self.retrieval.query.semantic.vectorset
|
117
|
+
self.req.vector.extend(query_vector)
|
87
118
|
|
88
|
-
|
89
|
-
|
119
|
+
def _apply_relation_query(self):
|
120
|
+
"""Relation queries are the legacy way to query the knowledge graph.
|
121
|
+
Given a set of entry points and some subtypes and entities to exclude
|
122
|
+
from search, it'd find the distance 1 neighbours (BFS)."""
|
90
123
|
|
91
|
-
|
92
|
-
|
93
|
-
request.relation_subgraph.deleted_groups.extend(
|
94
|
-
parsed.retrieval.query.relation.deleted_entity_groups
|
95
|
-
)
|
96
|
-
for group_id, deleted_entities in parsed.retrieval.query.relation.deleted_entities.items():
|
97
|
-
request.relation_subgraph.deleted_entities.append(
|
98
|
-
nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
|
99
|
-
node_subtype=group_id, node_values=deleted_entities
|
100
|
-
)
|
101
|
-
)
|
124
|
+
if self.retrieval.query.relation is None:
|
125
|
+
return
|
102
126
|
|
103
|
-
|
104
|
-
|
105
|
-
request.with_duplicates = parsed.retrieval.filters.with_duplicates
|
106
|
-
|
107
|
-
request.faceted.labels.extend([translate_label(facet) for facet in parsed.retrieval.filters.facets])
|
108
|
-
|
109
|
-
if (
|
110
|
-
parsed.retrieval.filters.security is not None
|
111
|
-
and len(parsed.retrieval.filters.security.groups) > 0
|
112
|
-
):
|
113
|
-
security_pb = utils_pb2.Security()
|
114
|
-
for group_id in parsed.retrieval.filters.security.groups:
|
115
|
-
if group_id not in security_pb.access_groups:
|
116
|
-
security_pb.access_groups.append(group_id)
|
117
|
-
request.security.CopyFrom(security_pb)
|
118
|
-
|
119
|
-
if parsed.retrieval.filters.field_expression:
|
120
|
-
request.field_filter.CopyFrom(parsed.retrieval.filters.field_expression)
|
121
|
-
if parsed.retrieval.filters.paragraph_expression:
|
122
|
-
request.paragraph_filter.CopyFrom(parsed.retrieval.filters.paragraph_expression)
|
123
|
-
request.filter_operator = parsed.retrieval.filters.filter_expression_operator
|
124
|
-
|
125
|
-
autofilter = []
|
126
|
-
if parsed.retrieval.filters.autofilter:
|
127
|
-
entity_filters = apply_entities_filter(request, parsed.retrieval.filters.autofilter)
|
128
|
-
autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
|
129
|
-
|
130
|
-
if parsed.retrieval.filters.hidden is not None:
|
131
|
-
expr = nodereader_pb2.FilterExpression()
|
132
|
-
if parsed.retrieval.filters.hidden:
|
133
|
-
expr.facet.facet = LABEL_HIDDEN
|
134
|
-
else:
|
135
|
-
expr.bool_not.facet.facet = LABEL_HIDDEN
|
136
|
-
|
137
|
-
add_and_expression(request.field_filter, expr)
|
138
|
-
|
139
|
-
# top_k
|
140
|
-
|
141
|
-
# Adjust requested page size depending on rank fusion and reranking algorithms.
|
142
|
-
#
|
143
|
-
# Some rerankers want more results than the requested by the user so
|
144
|
-
# reranking can have more choices.
|
145
|
-
|
146
|
-
rank_fusion_window = 0
|
147
|
-
if parsed.retrieval.rank_fusion is not None:
|
148
|
-
rank_fusion_window = parsed.retrieval.rank_fusion.window
|
149
|
-
|
150
|
-
reranker_window = 0
|
151
|
-
if parsed.retrieval.reranker is not None and isinstance(parsed.retrieval.reranker, PredictReranker):
|
152
|
-
reranker_window = parsed.retrieval.reranker.window
|
153
|
-
|
154
|
-
request.result_per_page = max(
|
155
|
-
request.result_per_page,
|
156
|
-
rank_fusion_window,
|
157
|
-
reranker_window,
|
158
|
-
)
|
127
|
+
node_features.inc({"type": "relations"})
|
159
128
|
|
160
|
-
|
161
|
-
|
129
|
+
# Entry points are source or target nodes we want to search for. We want
|
130
|
+
# any undirected path containing any entry point
|
131
|
+
entry_points_queries = []
|
132
|
+
for entry_point in self.retrieval.query.relation.entry_points:
|
133
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
134
|
+
if entry_point.value:
|
135
|
+
q.path.source.value = entry_point.value
|
136
|
+
q.path.source.node_type = entry_point.ntype
|
137
|
+
if entry_point.subtype:
|
138
|
+
q.path.source.node_subtype = entry_point.subtype
|
139
|
+
q.path.undirected = True
|
140
|
+
entry_points_queries.append(q)
|
141
|
+
|
142
|
+
# A query can specify nodes marked as deleted in the db (but not
|
143
|
+
# removed from the index). We want to exclude any path containing any of
|
144
|
+
# those nodes.
|
145
|
+
#
|
146
|
+
# The request groups values per subtype (to optimize request size) but,
|
147
|
+
# as we don't support OR at node value level, we'll split them.
|
148
|
+
deleted_nodes_queries = []
|
149
|
+
for subtype, deleted_entities in self.retrieval.query.relation.deleted_entities.items():
|
150
|
+
if len(deleted_entities) == 0:
|
151
|
+
continue
|
152
|
+
for deleted_entity_value in deleted_entities:
|
153
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
154
|
+
q.path.source.value = deleted_entity_value
|
155
|
+
q.path.source.node_subtype = subtype
|
156
|
+
q.path.undirected = True
|
157
|
+
deleted_nodes_queries.append(q)
|
158
|
+
|
159
|
+
# Subtypes can also be marked as deleted in the db (but kept in the
|
160
|
+
# index). We also want to exclude any triplet containing a node with such
|
161
|
+
# subtypes
|
162
|
+
excluded_subtypes_queries = []
|
163
|
+
for deleted_subtype in self.retrieval.query.relation.deleted_entity_groups:
|
164
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
165
|
+
q.path.source.node_subtype = deleted_subtype
|
166
|
+
q.path.undirected = True
|
167
|
+
excluded_subtypes_queries.append(q)
|
168
|
+
|
169
|
+
subqueries = []
|
170
|
+
|
171
|
+
if len(entry_points_queries) > 0:
|
172
|
+
if len(entry_points_queries) == 1:
|
173
|
+
q = entry_points_queries[0]
|
174
|
+
else:
|
175
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
176
|
+
q.bool_or.operands.extend(entry_points_queries)
|
177
|
+
subqueries.append(q)
|
178
|
+
|
179
|
+
if len(deleted_nodes_queries) > 0:
|
180
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
181
|
+
if len(deleted_nodes_queries) == 1:
|
182
|
+
q.bool_not.CopyFrom(deleted_nodes_queries[0])
|
183
|
+
else:
|
184
|
+
q.bool_not.bool_or.operands.extend(deleted_nodes_queries)
|
185
|
+
subqueries.append(q)
|
186
|
+
|
187
|
+
if len(excluded_subtypes_queries) > 0:
|
188
|
+
q = nodereader_pb2.GraphQuery.PathQuery()
|
189
|
+
if len(excluded_subtypes_queries) == 1:
|
190
|
+
q.bool_not.CopyFrom(excluded_subtypes_queries[0])
|
191
|
+
else:
|
192
|
+
q.bool_not.bool_or.operands.extend(excluded_subtypes_queries)
|
193
|
+
subqueries.append(q)
|
194
|
+
|
195
|
+
if len(subqueries) == 0:
|
196
|
+
# don't set anything, no graph query
|
197
|
+
pass
|
198
|
+
elif len(subqueries) == 1:
|
199
|
+
q = subqueries[0]
|
200
|
+
self.req.graph_search.query.path.CopyFrom(q)
|
201
|
+
else:
|
202
|
+
self.req.graph_search.query.path.bool_and.operands.extend(subqueries)
|
162
203
|
|
163
|
-
|
204
|
+
def _apply_filters(self):
|
205
|
+
self.req.with_duplicates = self.retrieval.filters.with_duplicates
|
164
206
|
|
165
|
-
|
166
|
-
|
167
|
-
|
207
|
+
self.req.faceted.labels.extend(
|
208
|
+
[translate_label(facet) for facet in self.retrieval.filters.facets]
|
209
|
+
)
|
168
210
|
|
169
|
-
|
211
|
+
if (
|
212
|
+
self.retrieval.filters.security is not None
|
213
|
+
and len(self.retrieval.filters.security.groups) > 0
|
214
|
+
):
|
215
|
+
security_pb = utils_pb2.Security()
|
216
|
+
for group_id in self.retrieval.filters.security.groups:
|
217
|
+
if group_id not in security_pb.access_groups:
|
218
|
+
security_pb.access_groups.append(group_id)
|
219
|
+
self.req.security.CopyFrom(security_pb)
|
220
|
+
|
221
|
+
if self.retrieval.filters.field_expression:
|
222
|
+
self.req.field_filter.CopyFrom(self.retrieval.filters.field_expression)
|
223
|
+
if self.retrieval.filters.paragraph_expression:
|
224
|
+
self.req.paragraph_filter.CopyFrom(self.retrieval.filters.paragraph_expression)
|
225
|
+
self.req.filter_operator = self.retrieval.filters.filter_expression_operator
|
226
|
+
|
227
|
+
if self.retrieval.filters.autofilter:
|
228
|
+
entity_filters = apply_entities_filter(self.req, self.retrieval.filters.autofilter)
|
229
|
+
self._autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
|
230
|
+
|
231
|
+
if self.retrieval.filters.hidden is not None:
|
232
|
+
expr = nodereader_pb2.FilterExpression()
|
233
|
+
if self.retrieval.filters.hidden:
|
234
|
+
expr.facet.facet = LABEL_HIDDEN
|
235
|
+
else:
|
236
|
+
expr.bool_not.facet.facet = LABEL_HIDDEN
|
237
|
+
|
238
|
+
add_and_expression(self.req.field_filter, expr)
|
239
|
+
|
240
|
+
def _apply_top_k(self):
|
241
|
+
"""Adjust requested page size depending on rank fusion and reranking
|
242
|
+
algorithms.
|
243
|
+
|
244
|
+
Some rerankers want more results than requested by the user so
|
245
|
+
reranking can have more choices.
|
246
|
+
"""
|
247
|
+
top_k = self.retrieval.top_k
|
248
|
+
|
249
|
+
rank_fusion_window = 0
|
250
|
+
if self.retrieval.rank_fusion is not None:
|
251
|
+
rank_fusion_window = self.retrieval.rank_fusion.window
|
252
|
+
|
253
|
+
reranker_window = 0
|
254
|
+
if self.retrieval.reranker is not None and isinstance(self.retrieval.reranker, PredictReranker):
|
255
|
+
reranker_window = self.retrieval.reranker.window
|
256
|
+
|
257
|
+
self.req.result_per_page = max(
|
258
|
+
top_k,
|
259
|
+
rank_fusion_window,
|
260
|
+
reranker_window,
|
261
|
+
)
|
170
262
|
|
171
263
|
|
172
264
|
def is_incomplete(retrieval: UnitRetrieval) -> bool:
|
nucliadb/search/search/utils.py
CHANGED
@@ -18,12 +18,12 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import logging
|
21
|
-
from typing import Optional
|
21
|
+
from typing import Optional
|
22
22
|
|
23
23
|
from pydantic import BaseModel
|
24
24
|
|
25
25
|
from nucliadb.common.datamanagers.atomic import kb
|
26
|
-
from nucliadb_models.search import
|
26
|
+
from nucliadb_models.search import MinScore
|
27
27
|
from nucliadb_utils import const
|
28
28
|
from nucliadb_utils.utilities import has_feature
|
29
29
|
|
@@ -39,36 +39,6 @@ async def filter_hidden_resources(kbid: str, show_hidden: bool) -> Optional[bool
|
|
39
39
|
return None # None = No filtering, show all resources
|
40
40
|
|
41
41
|
|
42
|
-
def is_empty_query(request: BaseSearchRequest) -> bool:
|
43
|
-
return len(request.query) == 0
|
44
|
-
|
45
|
-
|
46
|
-
def has_user_vectors(request: BaseSearchRequest) -> bool:
|
47
|
-
return request.vector is not None and len(request.vector) > 0
|
48
|
-
|
49
|
-
|
50
|
-
def is_exact_match_only_query(request: BaseSearchRequest) -> bool:
|
51
|
-
"""
|
52
|
-
'"something"' -> True
|
53
|
-
'foo "something" else' -> False
|
54
|
-
"""
|
55
|
-
query = request.query.strip()
|
56
|
-
return len(query) > 0 and query.startswith('"') and query.endswith('"')
|
57
|
-
|
58
|
-
|
59
|
-
def should_disable_vector_search(request: BaseSearchRequest) -> bool:
|
60
|
-
if has_user_vectors(request):
|
61
|
-
return False
|
62
|
-
|
63
|
-
if is_exact_match_only_query(request):
|
64
|
-
return True
|
65
|
-
|
66
|
-
if is_empty_query(request):
|
67
|
-
return True
|
68
|
-
|
69
|
-
return False
|
70
|
-
|
71
|
-
|
72
42
|
def min_score_from_query_params(
|
73
43
|
min_score_bm25: float,
|
74
44
|
min_score_semantic: Optional[float],
|
@@ -79,16 +49,6 @@ def min_score_from_query_params(
|
|
79
49
|
return MinScore(bm25=min_score_bm25, semantic=semantic)
|
80
50
|
|
81
51
|
|
82
|
-
def min_score_from_payload(min_score: Optional[Union[float, MinScore]]) -> MinScore:
|
83
|
-
# Keep backward compatibility with the deprecated
|
84
|
-
# min_score payload parameter being a float
|
85
|
-
if min_score is None:
|
86
|
-
return MinScore(bm25=0, semantic=None)
|
87
|
-
elif isinstance(min_score, float):
|
88
|
-
return MinScore(bm25=0, semantic=min_score)
|
89
|
-
return min_score
|
90
|
-
|
91
|
-
|
92
52
|
def maybe_log_request_payload(kbid: str, endpoint: str, item: BaseModel):
|
93
53
|
if has_feature(const.Features.LOG_REQUEST_PAYLOADS, context={"kbid": kbid}, default=False):
|
94
54
|
logger.info(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.3.7.
|
3
|
+
Version: 6.3.7.post4116
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.3.7.
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.7.
|
25
|
-
Requires-Dist: nucliadb-protos>=6.3.7.
|
26
|
-
Requires-Dist: nucliadb-models>=6.3.7.
|
27
|
-
Requires-Dist: nidx-protos>=6.3.7.
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.3.7.post4116
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.7.post4116
|
25
|
+
Requires-Dist: nucliadb-protos>=6.3.7.post4116
|
26
|
+
Requires-Dist: nucliadb-models>=6.3.7.post4116
|
27
|
+
Requires-Dist: nidx-protos>=6.3.7.post4116
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn[standard]
|