nucliadb 6.3.7.post4081__py3-none-any.whl → 6.3.7.post4114__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/context/__init__.py +90 -25
- nucliadb/common/context/fastapi.py +4 -2
- nucliadb/ingest/consumer/consumer.py +3 -4
- nucliadb/search/api/v1/find.py +5 -5
- nucliadb/search/api/v1/search.py +2 -10
- nucliadb/search/search/chat/ask.py +6 -3
- nucliadb/search/search/chat/query.py +21 -17
- nucliadb/search/search/find.py +14 -5
- nucliadb/search/search/find_merge.py +27 -13
- nucliadb/search/search/merge.py +17 -18
- nucliadb/search/search/query_parser/models.py +22 -27
- nucliadb/search/search/query_parser/parsers/common.py +32 -21
- nucliadb/search/search/query_parser/parsers/find.py +31 -8
- nucliadb/search/search/query_parser/parsers/search.py +33 -10
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +207 -115
- nucliadb/search/search/utils.py +2 -42
- nucliadb/train/app.py +0 -3
- nucliadb/train/lifecycle.py +16 -11
- {nucliadb-6.3.7.post4081.dist-info → nucliadb-6.3.7.post4114.dist-info}/METADATA +6 -6
- {nucliadb-6.3.7.post4081.dist-info → nucliadb-6.3.7.post4114.dist-info}/RECORD +23 -23
- {nucliadb-6.3.7.post4081.dist-info → nucliadb-6.3.7.post4114.dist-info}/WHEEL +1 -1
- {nucliadb-6.3.7.post4081.dist-info → nucliadb-6.3.7.post4114.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.7.post4081.dist-info → nucliadb-6.3.7.post4114.dist-info}/top_level.txt +0 -0
@@ -21,10 +21,7 @@ from dataclasses import dataclass
|
|
21
21
|
from datetime import datetime
|
22
22
|
from typing import Literal, Optional, Union
|
23
23
|
|
24
|
-
from pydantic import (
|
25
|
-
BaseModel,
|
26
|
-
Field,
|
27
|
-
)
|
24
|
+
from pydantic import BaseModel, ConfigDict, Field
|
28
25
|
|
29
26
|
from nucliadb.search.search.query_parser.fetcher import Fetcher
|
30
27
|
from nucliadb_models import search as search_models
|
@@ -35,8 +32,7 @@ from nucliadb_protos import nodereader_pb2, utils_pb2
|
|
35
32
|
# query
|
36
33
|
|
37
34
|
|
38
|
-
|
39
|
-
class _TextQuery:
|
35
|
+
class _TextQuery(BaseModel):
|
40
36
|
query: str
|
41
37
|
is_synonyms_query: bool
|
42
38
|
min_score: float
|
@@ -48,24 +44,23 @@ FulltextQuery = _TextQuery
|
|
48
44
|
KeywordQuery = _TextQuery
|
49
45
|
|
50
46
|
|
51
|
-
|
52
|
-
class SemanticQuery:
|
47
|
+
class SemanticQuery(BaseModel):
|
53
48
|
query: Optional[list[float]]
|
54
49
|
vectorset: str
|
55
50
|
min_score: float
|
56
51
|
|
57
52
|
|
58
|
-
|
59
|
-
|
60
|
-
|
53
|
+
class RelationQuery(BaseModel):
|
54
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
55
|
+
|
56
|
+
entry_points: list[utils_pb2.RelationNode]
|
61
57
|
# list[subtype]
|
62
58
|
deleted_entity_groups: list[str]
|
63
59
|
# subtype -> list[entity]
|
64
60
|
deleted_entities: dict[str, list[str]]
|
65
61
|
|
66
62
|
|
67
|
-
|
68
|
-
class Query:
|
63
|
+
class Query(BaseModel):
|
69
64
|
fulltext: Optional[FulltextQuery] = None
|
70
65
|
keyword: Optional[KeywordQuery] = None
|
71
66
|
semantic: Optional[SemanticQuery] = None
|
@@ -75,8 +70,9 @@ class Query:
|
|
75
70
|
# filters
|
76
71
|
|
77
72
|
|
78
|
-
|
79
|
-
|
73
|
+
class Filters(BaseModel):
|
74
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
75
|
+
|
80
76
|
field_expression: Optional[nodereader_pb2.FilterExpression] = None
|
81
77
|
paragraph_expression: Optional[nodereader_pb2.FilterExpression] = None
|
82
78
|
filter_expression_operator: nodereader_pb2.FilterOperator.ValueType = (
|
@@ -125,30 +121,29 @@ Reranker = Union[NoopReranker, PredictReranker]
|
|
125
121
|
# retrieval and generation operations
|
126
122
|
|
127
123
|
|
128
|
-
|
129
|
-
class UnitRetrieval:
|
124
|
+
class UnitRetrieval(BaseModel):
|
130
125
|
query: Query
|
131
126
|
top_k: int
|
132
|
-
filters: Filters
|
133
|
-
|
134
|
-
|
135
|
-
# TODO: reranking fusion depends on the response building, not the retrieval
|
136
|
-
reranker: Reranker
|
127
|
+
filters: Filters = Field(default_factory=Filters)
|
128
|
+
rank_fusion: Optional[RankFusion] = None
|
129
|
+
reranker: Optional[Reranker] = None
|
137
130
|
|
138
131
|
|
139
|
-
|
140
|
-
|
132
|
+
# TODO: augmentation things: hydration...
|
133
|
+
|
134
|
+
|
135
|
+
class Generation(BaseModel):
|
141
136
|
use_visual_llm: bool
|
142
137
|
max_context_tokens: int
|
143
138
|
max_answer_tokens: Optional[int]
|
144
139
|
|
145
140
|
|
146
|
-
|
147
|
-
|
141
|
+
class ParsedQuery(BaseModel):
|
142
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
143
|
+
|
148
144
|
fetcher: Fetcher
|
149
145
|
retrieval: UnitRetrieval
|
150
146
|
generation: Optional[Generation] = None
|
151
|
-
# TODO: add merge, rank fusion, rerank...
|
152
147
|
|
153
148
|
|
154
149
|
### Catalog
|
@@ -28,7 +28,6 @@ from nucliadb.search.search.query_parser.models import (
|
|
28
28
|
KeywordQuery,
|
29
29
|
SemanticQuery,
|
30
30
|
)
|
31
|
-
from nucliadb.search.search.utils import should_disable_vector_search
|
32
31
|
from nucliadb_models import search as search_models
|
33
32
|
|
34
33
|
DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
|
@@ -38,28 +37,40 @@ DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
|
|
38
37
|
INVALID_QUERY = re.compile(r"- +\*")
|
39
38
|
|
40
39
|
|
41
|
-
def
|
40
|
+
def validate_query_syntax(query: str):
|
42
41
|
# Filter some queries that panic tantivy, better than returning the 500
|
43
|
-
if INVALID_QUERY.search(
|
42
|
+
if INVALID_QUERY.search(query):
|
44
43
|
raise InvalidQueryError("query", "Invalid query syntax")
|
45
44
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
45
|
+
|
46
|
+
def is_empty_query(request: search_models.BaseSearchRequest) -> bool:
|
47
|
+
return len(request.query) == 0
|
48
|
+
|
49
|
+
|
50
|
+
def has_user_vectors(request: search_models.BaseSearchRequest) -> bool:
|
51
|
+
return request.vector is not None and len(request.vector) > 0
|
52
|
+
|
53
|
+
|
54
|
+
def is_exact_match_only_query(request: search_models.BaseSearchRequest) -> bool:
|
55
|
+
"""
|
56
|
+
'"something"' -> True
|
57
|
+
'foo "something" else' -> False
|
58
|
+
"""
|
59
|
+
query = request.query.strip()
|
60
|
+
return len(query) > 0 and query.startswith('"') and query.endswith('"')
|
61
|
+
|
62
|
+
|
63
|
+
def should_disable_vector_search(request: search_models.BaseSearchRequest) -> bool:
|
64
|
+
if has_user_vectors(request):
|
65
|
+
return False
|
66
|
+
|
67
|
+
if is_exact_match_only_query(request):
|
68
|
+
return True
|
69
|
+
|
70
|
+
if is_empty_query(request):
|
71
|
+
return True
|
72
|
+
|
73
|
+
return False
|
63
74
|
|
64
75
|
|
65
76
|
def parse_top_k(item: search_models.BaseSearchRequest) -> int:
|
@@ -92,7 +103,7 @@ async def parse_keyword_query(
|
|
92
103
|
|
93
104
|
|
94
105
|
async def parse_semantic_query(
|
95
|
-
item: search_models.
|
106
|
+
item: Union[search_models.SearchRequest, search_models.FindRequest],
|
96
107
|
*,
|
97
108
|
fetcher: Fetcher,
|
98
109
|
) -> SemanticQuery:
|
@@ -53,7 +53,8 @@ from .common import (
|
|
53
53
|
parse_keyword_query,
|
54
54
|
parse_semantic_query,
|
55
55
|
parse_top_k,
|
56
|
-
|
56
|
+
should_disable_vector_search,
|
57
|
+
validate_query_syntax,
|
57
58
|
)
|
58
59
|
|
59
60
|
|
@@ -93,7 +94,7 @@ class _FindParser:
|
|
93
94
|
self._top_k: Optional[int] = None
|
94
95
|
|
95
96
|
async def parse(self) -> UnitRetrieval:
|
96
|
-
|
97
|
+
self._validate_request()
|
97
98
|
|
98
99
|
self._top_k = parse_top_k(self.item)
|
99
100
|
|
@@ -101,13 +102,13 @@ class _FindParser:
|
|
101
102
|
|
102
103
|
self._query = Query()
|
103
104
|
|
104
|
-
if search_models.
|
105
|
+
if search_models.FindOptions.KEYWORD in self.item.features:
|
105
106
|
self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
|
106
107
|
|
107
|
-
if search_models.
|
108
|
+
if search_models.FindOptions.SEMANTIC in self.item.features:
|
108
109
|
self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)
|
109
110
|
|
110
|
-
if search_models.
|
111
|
+
if search_models.FindOptions.RELATIONS in self.item.features:
|
111
112
|
self._query.relation = await self._parse_relation_query()
|
112
113
|
|
113
114
|
# TODO: graph search
|
@@ -130,13 +131,35 @@ class _FindParser:
|
|
130
131
|
if isinstance(reranker, PredictReranker):
|
131
132
|
rank_fusion.window = max(rank_fusion.window, reranker.window)
|
132
133
|
|
133
|
-
|
134
|
+
retrieval = UnitRetrieval(
|
134
135
|
query=self._query,
|
135
136
|
top_k=self._top_k,
|
136
137
|
filters=filters,
|
137
138
|
rank_fusion=rank_fusion,
|
138
139
|
reranker=reranker,
|
139
140
|
)
|
141
|
+
return retrieval
|
142
|
+
|
143
|
+
def _validate_request(self):
|
144
|
+
validate_query_syntax(self.item.query)
|
145
|
+
|
146
|
+
# synonyms are not compatible with vector/graph search
|
147
|
+
if (
|
148
|
+
self.item.with_synonyms
|
149
|
+
and self.item.query
|
150
|
+
and (
|
151
|
+
search_models.FindOptions.SEMANTIC in self.item.features
|
152
|
+
or search_models.FindOptions.RELATIONS in self.item.features
|
153
|
+
)
|
154
|
+
):
|
155
|
+
raise InvalidQueryError(
|
156
|
+
"synonyms",
|
157
|
+
"Search with custom synonyms is only supported on paragraph and document search",
|
158
|
+
)
|
159
|
+
|
160
|
+
if search_models.FindOptions.SEMANTIC in self.item.features:
|
161
|
+
if should_disable_vector_search(self.item):
|
162
|
+
self.item.features.remove(search_models.FindOptions.SEMANTIC)
|
140
163
|
|
141
164
|
async def _parse_relation_query(self) -> RelationQuery:
|
142
165
|
detected_entities = await self._get_detected_entities()
|
@@ -147,7 +170,7 @@ class _FindParser:
|
|
147
170
|
deleted_entities = meta_cache.deleted_entities
|
148
171
|
|
149
172
|
return RelationQuery(
|
150
|
-
|
173
|
+
entry_points=detected_entities,
|
151
174
|
deleted_entity_groups=deleted_entity_groups,
|
152
175
|
deleted_entities=deleted_entities,
|
153
176
|
)
|
@@ -220,7 +243,7 @@ class _FindParser:
|
|
220
243
|
autofilter = None
|
221
244
|
if self.item.autofilter:
|
222
245
|
if self._query.relation is not None:
|
223
|
-
autofilter = self._query.relation.
|
246
|
+
autofilter = self._query.relation.entry_points
|
224
247
|
else:
|
225
248
|
autofilter = await self._get_detected_entities()
|
226
249
|
|
@@ -26,10 +26,8 @@ from nucliadb.search.search.query_parser.fetcher import Fetcher
|
|
26
26
|
from nucliadb.search.search.query_parser.filter_expression import parse_expression
|
27
27
|
from nucliadb.search.search.query_parser.models import (
|
28
28
|
Filters,
|
29
|
-
NoopReranker,
|
30
29
|
ParsedQuery,
|
31
30
|
Query,
|
32
|
-
RankFusion,
|
33
31
|
RelationQuery,
|
34
32
|
UnitRetrieval,
|
35
33
|
_TextQuery,
|
@@ -46,7 +44,13 @@ from nucliadb_models.search import (
|
|
46
44
|
)
|
47
45
|
from nucliadb_protos import nodereader_pb2, utils_pb2
|
48
46
|
|
49
|
-
from .common import
|
47
|
+
from .common import (
|
48
|
+
parse_keyword_query,
|
49
|
+
parse_semantic_query,
|
50
|
+
parse_top_k,
|
51
|
+
should_disable_vector_search,
|
52
|
+
validate_query_syntax,
|
53
|
+
)
|
50
54
|
|
51
55
|
INDEX_SORTABLE_FIELDS = [
|
52
56
|
SortField.CREATED,
|
@@ -87,7 +91,7 @@ class _SearchParser:
|
|
87
91
|
self._top_k: Optional[int] = None
|
88
92
|
|
89
93
|
async def parse(self) -> UnitRetrieval:
|
90
|
-
|
94
|
+
self._validate_request()
|
91
95
|
|
92
96
|
self._top_k = parse_top_k(self.item)
|
93
97
|
|
@@ -113,14 +117,33 @@ class _SearchParser:
|
|
113
117
|
|
114
118
|
filters = await self._parse_filters()
|
115
119
|
|
116
|
-
|
120
|
+
retrieval = UnitRetrieval(
|
117
121
|
query=self._query,
|
118
122
|
top_k=self._top_k,
|
119
123
|
filters=filters,
|
120
|
-
# TODO: this should be in a post retrieval step
|
121
|
-
rank_fusion=RankFusion(window=self._top_k),
|
122
|
-
reranker=NoopReranker(),
|
123
124
|
)
|
125
|
+
return retrieval
|
126
|
+
|
127
|
+
def _validate_request(self):
|
128
|
+
validate_query_syntax(self.item.query)
|
129
|
+
|
130
|
+
# synonyms are not compatible with vector/graph search
|
131
|
+
if (
|
132
|
+
self.item.with_synonyms
|
133
|
+
and self.item.query
|
134
|
+
and (
|
135
|
+
search_models.SearchOptions.SEMANTIC in self.item.features
|
136
|
+
or search_models.SearchOptions.RELATIONS in self.item.features
|
137
|
+
)
|
138
|
+
):
|
139
|
+
raise InvalidQueryError(
|
140
|
+
"synonyms",
|
141
|
+
"Search with custom synonyms is only supported on paragraph and document search",
|
142
|
+
)
|
143
|
+
|
144
|
+
if search_models.SearchOptions.SEMANTIC in self.item.features:
|
145
|
+
if should_disable_vector_search(self.item):
|
146
|
+
self.item.features.remove(search_models.SearchOptions.SEMANTIC)
|
124
147
|
|
125
148
|
async def _parse_text_query(self) -> _TextQuery:
|
126
149
|
assert self._top_k is not None, "top_k must be parsed before text query"
|
@@ -140,7 +163,7 @@ class _SearchParser:
|
|
140
163
|
meta_cache = await self.fetcher.get_entities_meta_cache()
|
141
164
|
deleted_entities = meta_cache.deleted_entities
|
142
165
|
return RelationQuery(
|
143
|
-
|
166
|
+
entry_points=detected_entities,
|
144
167
|
deleted_entity_groups=deleted_entity_groups,
|
145
168
|
deleted_entities=deleted_entities,
|
146
169
|
)
|
@@ -231,7 +254,7 @@ class _SearchParser:
|
|
231
254
|
autofilter = None
|
232
255
|
if self.item.autofilter:
|
233
256
|
if self._query.relation is not None:
|
234
|
-
autofilter = self._query.relation.
|
257
|
+
autofilter = self._query.relation.entry_points
|
235
258
|
else:
|
236
259
|
autofilter = await self._get_detected_entities()
|
237
260
|
|