nucliadb 6.3.7.post4081__py3-none-any.whl → 6.3.7.post4114__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -21,10 +21,7 @@ from dataclasses import dataclass
21
21
  from datetime import datetime
22
22
  from typing import Literal, Optional, Union
23
23
 
24
- from pydantic import (
25
- BaseModel,
26
- Field,
27
- )
24
+ from pydantic import BaseModel, ConfigDict, Field
28
25
 
29
26
  from nucliadb.search.search.query_parser.fetcher import Fetcher
30
27
  from nucliadb_models import search as search_models
@@ -35,8 +32,7 @@ from nucliadb_protos import nodereader_pb2, utils_pb2
35
32
  # query
36
33
 
37
34
 
38
- @dataclass
39
- class _TextQuery:
35
+ class _TextQuery(BaseModel):
40
36
  query: str
41
37
  is_synonyms_query: bool
42
38
  min_score: float
@@ -48,24 +44,23 @@ FulltextQuery = _TextQuery
48
44
  KeywordQuery = _TextQuery
49
45
 
50
46
 
51
- @dataclass
52
- class SemanticQuery:
47
+ class SemanticQuery(BaseModel):
53
48
  query: Optional[list[float]]
54
49
  vectorset: str
55
50
  min_score: float
56
51
 
57
52
 
58
- @dataclass
59
- class RelationQuery:
60
- detected_entities: list[utils_pb2.RelationNode]
53
+ class RelationQuery(BaseModel):
54
+ model_config = ConfigDict(arbitrary_types_allowed=True)
55
+
56
+ entry_points: list[utils_pb2.RelationNode]
61
57
  # list[subtype]
62
58
  deleted_entity_groups: list[str]
63
59
  # subtype -> list[entity]
64
60
  deleted_entities: dict[str, list[str]]
65
61
 
66
62
 
67
- @dataclass
68
- class Query:
63
+ class Query(BaseModel):
69
64
  fulltext: Optional[FulltextQuery] = None
70
65
  keyword: Optional[KeywordQuery] = None
71
66
  semantic: Optional[SemanticQuery] = None
@@ -75,8 +70,9 @@ class Query:
75
70
  # filters
76
71
 
77
72
 
78
- @dataclass
79
- class Filters:
73
+ class Filters(BaseModel):
74
+ model_config = ConfigDict(arbitrary_types_allowed=True)
75
+
80
76
  field_expression: Optional[nodereader_pb2.FilterExpression] = None
81
77
  paragraph_expression: Optional[nodereader_pb2.FilterExpression] = None
82
78
  filter_expression_operator: nodereader_pb2.FilterOperator.ValueType = (
@@ -125,30 +121,29 @@ Reranker = Union[NoopReranker, PredictReranker]
125
121
  # retrieval and generation operations
126
122
 
127
123
 
128
- @dataclass
129
- class UnitRetrieval:
124
+ class UnitRetrieval(BaseModel):
130
125
  query: Query
131
126
  top_k: int
132
- filters: Filters
133
- # TODO: rank fusion depends on the response building, not the retrieval
134
- rank_fusion: RankFusion
135
- # TODO: reranking fusion depends on the response building, not the retrieval
136
- reranker: Reranker
127
+ filters: Filters = Field(default_factory=Filters)
128
+ rank_fusion: Optional[RankFusion] = None
129
+ reranker: Optional[Reranker] = None
137
130
 
138
131
 
139
- @dataclass
140
- class Generation:
132
+ # TODO: augmentation things: hydration...
133
+
134
+
135
+ class Generation(BaseModel):
141
136
  use_visual_llm: bool
142
137
  max_context_tokens: int
143
138
  max_answer_tokens: Optional[int]
144
139
 
145
140
 
146
- @dataclass
147
- class ParsedQuery:
141
+ class ParsedQuery(BaseModel):
142
+ model_config = ConfigDict(arbitrary_types_allowed=True)
143
+
148
144
  fetcher: Fetcher
149
145
  retrieval: UnitRetrieval
150
146
  generation: Optional[Generation] = None
151
- # TODO: add merge, rank fusion, rerank...
152
147
 
153
148
 
154
149
  ### Catalog
@@ -28,7 +28,6 @@ from nucliadb.search.search.query_parser.models import (
28
28
  KeywordQuery,
29
29
  SemanticQuery,
30
30
  )
31
- from nucliadb.search.search.utils import should_disable_vector_search
32
31
  from nucliadb_models import search as search_models
33
32
 
34
33
  DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
@@ -38,28 +37,40 @@ DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
38
37
  INVALID_QUERY = re.compile(r"- +\*")
39
38
 
40
39
 
41
- def validate_base_request(item: search_models.BaseSearchRequest):
40
+ def validate_query_syntax(query: str):
42
41
  # Filter some queries that panic tantivy, better than returning the 500
43
- if INVALID_QUERY.search(item.query):
42
+ if INVALID_QUERY.search(query):
44
43
  raise InvalidQueryError("query", "Invalid query syntax")
45
44
 
46
- # synonyms are not compatible with vector/graph search
47
- if (
48
- item.with_synonyms
49
- and item.query
50
- and (
51
- search_models.SearchOptions.SEMANTIC in item.features
52
- or search_models.SearchOptions.RELATIONS in item.features
53
- )
54
- ):
55
- raise InvalidQueryError(
56
- "synonyms",
57
- "Search with custom synonyms is only supported on paragraph and document search",
58
- )
59
-
60
- if search_models.SearchOptions.SEMANTIC in item.features:
61
- if should_disable_vector_search(item):
62
- item.features.remove(search_models.SearchOptions.SEMANTIC)
45
+
46
+ def is_empty_query(request: search_models.BaseSearchRequest) -> bool:
47
+ return len(request.query) == 0
48
+
49
+
50
+ def has_user_vectors(request: search_models.BaseSearchRequest) -> bool:
51
+ return request.vector is not None and len(request.vector) > 0
52
+
53
+
54
+ def is_exact_match_only_query(request: search_models.BaseSearchRequest) -> bool:
55
+ """
56
+ '"something"' -> True
57
+ 'foo "something" else' -> False
58
+ """
59
+ query = request.query.strip()
60
+ return len(query) > 0 and query.startswith('"') and query.endswith('"')
61
+
62
+
63
+ def should_disable_vector_search(request: search_models.BaseSearchRequest) -> bool:
64
+ if has_user_vectors(request):
65
+ return False
66
+
67
+ if is_exact_match_only_query(request):
68
+ return True
69
+
70
+ if is_empty_query(request):
71
+ return True
72
+
73
+ return False
63
74
 
64
75
 
65
76
  def parse_top_k(item: search_models.BaseSearchRequest) -> int:
@@ -92,7 +103,7 @@ async def parse_keyword_query(
92
103
 
93
104
 
94
105
  async def parse_semantic_query(
95
- item: search_models.BaseSearchRequest,
106
+ item: Union[search_models.SearchRequest, search_models.FindRequest],
96
107
  *,
97
108
  fetcher: Fetcher,
98
109
  ) -> SemanticQuery:
@@ -53,7 +53,8 @@ from .common import (
53
53
  parse_keyword_query,
54
54
  parse_semantic_query,
55
55
  parse_top_k,
56
- validate_base_request,
56
+ should_disable_vector_search,
57
+ validate_query_syntax,
57
58
  )
58
59
 
59
60
 
@@ -93,7 +94,7 @@ class _FindParser:
93
94
  self._top_k: Optional[int] = None
94
95
 
95
96
  async def parse(self) -> UnitRetrieval:
96
- validate_base_request(self.item)
97
+ self._validate_request()
97
98
 
98
99
  self._top_k = parse_top_k(self.item)
99
100
 
@@ -101,13 +102,13 @@ class _FindParser:
101
102
 
102
103
  self._query = Query()
103
104
 
104
- if search_models.SearchOptions.KEYWORD in self.item.features:
105
+ if search_models.FindOptions.KEYWORD in self.item.features:
105
106
  self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
106
107
 
107
- if search_models.SearchOptions.SEMANTIC in self.item.features:
108
+ if search_models.FindOptions.SEMANTIC in self.item.features:
108
109
  self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)
109
110
 
110
- if search_models.SearchOptions.RELATIONS in self.item.features:
111
+ if search_models.FindOptions.RELATIONS in self.item.features:
111
112
  self._query.relation = await self._parse_relation_query()
112
113
 
113
114
  # TODO: graph search
@@ -130,13 +131,35 @@ class _FindParser:
130
131
  if isinstance(reranker, PredictReranker):
131
132
  rank_fusion.window = max(rank_fusion.window, reranker.window)
132
133
 
133
- return UnitRetrieval(
134
+ retrieval = UnitRetrieval(
134
135
  query=self._query,
135
136
  top_k=self._top_k,
136
137
  filters=filters,
137
138
  rank_fusion=rank_fusion,
138
139
  reranker=reranker,
139
140
  )
141
+ return retrieval
142
+
143
+ def _validate_request(self):
144
+ validate_query_syntax(self.item.query)
145
+
146
+ # synonyms are not compatible with vector/graph search
147
+ if (
148
+ self.item.with_synonyms
149
+ and self.item.query
150
+ and (
151
+ search_models.FindOptions.SEMANTIC in self.item.features
152
+ or search_models.FindOptions.RELATIONS in self.item.features
153
+ )
154
+ ):
155
+ raise InvalidQueryError(
156
+ "synonyms",
157
+ "Search with custom synonyms is only supported on paragraph and document search",
158
+ )
159
+
160
+ if search_models.FindOptions.SEMANTIC in self.item.features:
161
+ if should_disable_vector_search(self.item):
162
+ self.item.features.remove(search_models.FindOptions.SEMANTIC)
140
163
 
141
164
  async def _parse_relation_query(self) -> RelationQuery:
142
165
  detected_entities = await self._get_detected_entities()
@@ -147,7 +170,7 @@ class _FindParser:
147
170
  deleted_entities = meta_cache.deleted_entities
148
171
 
149
172
  return RelationQuery(
150
- detected_entities=detected_entities,
173
+ entry_points=detected_entities,
151
174
  deleted_entity_groups=deleted_entity_groups,
152
175
  deleted_entities=deleted_entities,
153
176
  )
@@ -220,7 +243,7 @@ class _FindParser:
220
243
  autofilter = None
221
244
  if self.item.autofilter:
222
245
  if self._query.relation is not None:
223
- autofilter = self._query.relation.detected_entities
246
+ autofilter = self._query.relation.entry_points
224
247
  else:
225
248
  autofilter = await self._get_detected_entities()
226
249
 
@@ -26,10 +26,8 @@ from nucliadb.search.search.query_parser.fetcher import Fetcher
26
26
  from nucliadb.search.search.query_parser.filter_expression import parse_expression
27
27
  from nucliadb.search.search.query_parser.models import (
28
28
  Filters,
29
- NoopReranker,
30
29
  ParsedQuery,
31
30
  Query,
32
- RankFusion,
33
31
  RelationQuery,
34
32
  UnitRetrieval,
35
33
  _TextQuery,
@@ -46,7 +44,13 @@ from nucliadb_models.search import (
46
44
  )
47
45
  from nucliadb_protos import nodereader_pb2, utils_pb2
48
46
 
49
- from .common import parse_keyword_query, parse_semantic_query, parse_top_k, validate_base_request
47
+ from .common import (
48
+ parse_keyword_query,
49
+ parse_semantic_query,
50
+ parse_top_k,
51
+ should_disable_vector_search,
52
+ validate_query_syntax,
53
+ )
50
54
 
51
55
  INDEX_SORTABLE_FIELDS = [
52
56
  SortField.CREATED,
@@ -87,7 +91,7 @@ class _SearchParser:
87
91
  self._top_k: Optional[int] = None
88
92
 
89
93
  async def parse(self) -> UnitRetrieval:
90
- validate_base_request(self.item)
94
+ self._validate_request()
91
95
 
92
96
  self._top_k = parse_top_k(self.item)
93
97
 
@@ -113,14 +117,33 @@ class _SearchParser:
113
117
 
114
118
  filters = await self._parse_filters()
115
119
 
116
- return UnitRetrieval(
120
+ retrieval = UnitRetrieval(
117
121
  query=self._query,
118
122
  top_k=self._top_k,
119
123
  filters=filters,
120
- # TODO: this should be in a post retrieval step
121
- rank_fusion=RankFusion(window=self._top_k),
122
- reranker=NoopReranker(),
123
124
  )
125
+ return retrieval
126
+
127
+ def _validate_request(self):
128
+ validate_query_syntax(self.item.query)
129
+
130
+ # synonyms are not compatible with vector/graph search
131
+ if (
132
+ self.item.with_synonyms
133
+ and self.item.query
134
+ and (
135
+ search_models.SearchOptions.SEMANTIC in self.item.features
136
+ or search_models.SearchOptions.RELATIONS in self.item.features
137
+ )
138
+ ):
139
+ raise InvalidQueryError(
140
+ "synonyms",
141
+ "Search with custom synonyms is only supported on paragraph and document search",
142
+ )
143
+
144
+ if search_models.SearchOptions.SEMANTIC in self.item.features:
145
+ if should_disable_vector_search(self.item):
146
+ self.item.features.remove(search_models.SearchOptions.SEMANTIC)
124
147
 
125
148
  async def _parse_text_query(self) -> _TextQuery:
126
149
  assert self._top_k is not None, "top_k must be parsed before text query"
@@ -140,7 +163,7 @@ class _SearchParser:
140
163
  meta_cache = await self.fetcher.get_entities_meta_cache()
141
164
  deleted_entities = meta_cache.deleted_entities
142
165
  return RelationQuery(
143
- detected_entities=detected_entities,
166
+ entry_points=detected_entities,
144
167
  deleted_entity_groups=deleted_entity_groups,
145
168
  deleted_entities=deleted_entities,
146
169
  )
@@ -231,7 +254,7 @@ class _SearchParser:
231
254
  autofilter = None
232
255
  if self.item.autofilter:
233
256
  if self._query.relation is not None:
234
- autofilter = self._query.relation.detected_entities
257
+ autofilter = self._query.relation.entry_points
235
258
  else:
236
259
  autofilter = await self._get_detected_entities()
237
260