nucliadb 6.2.1.post3139__py3-none-any.whl → 6.2.1.post3165__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +1 -1
- nucliadb/search/search/query.py +41 -240
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +399 -0
- nucliadb/search/search/query_parser/parser.py +13 -12
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/METADATA +5 -5
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/RECORD +12 -11
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/zip-safe +0 -0
nucliadb/search/search/exceptions.py CHANGED
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+from nucliadb.search.search.query_parser.exceptions import InvalidQueryError as InvalidQueryError
+
 
 class IncompleteFindResultsError(Exception):
     pass
@@ -24,10 +26,3 @@ class IncompleteFindResultsError(Exception):
 
 class ResourceNotFoundError(Exception):
     pass
-
-
-class InvalidQueryError(Exception):
-    def __init__(self, param: str, reason: str):
-        self.param = param
-        self.reason = reason
-        super().__init__(f"Invalid query. Error in {param}: {reason}")
nucliadb/search/search/find.py CHANGED
@@ -260,7 +260,7 @@ async def query_parser_from_find_request(
     # XXX this is becoming the new /find query parsing, this should be moved to
     # a cleaner abstraction
 
-    parsed = parse_find(item)
+    parsed = await parse_find(kbid, item)
 
     rank_fusion = get_rank_fusion(parsed.rank_fusion)
     reranker = get_reranker(parsed.reranker)
nucliadb/search/search/query.py CHANGED
@@ -23,12 +23,9 @@ import string
 from datetime import datetime
 from typing import Any, Awaitable, Optional, Union
 
-from async_lru import alru_cache
-
 from nucliadb.common import datamanagers
-from nucliadb.common.maindb.utils import get_driver
 from nucliadb.search import logger
-from nucliadb.search.predict import SendToPredictError, convert_relations
+from nucliadb.search.predict import SendToPredictError
 from nucliadb.search.search.filters import (
     convert_to_node_filters,
     flatten_filter_literals,
@@ -39,15 +36,14 @@ from nucliadb.search.search.filters import (
 )
 from nucliadb.search.search.metrics import (
     node_features,
-    query_parse_dependency_observer,
 )
+from nucliadb.search.search.query_parser.fetcher import Fetcher, get_classification_labels
 from nucliadb.search.search.rank_fusion import (
     RankFusionAlgorithm,
 )
 from nucliadb.search.search.rerankers import (
     Reranker,
 )
-from nucliadb.search.utilities import get_predict
 from nucliadb_models.internal.predict import QueryInfo
 from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
@@ -63,7 +59,7 @@ from nucliadb_models.search import (
     SuggestOptions,
 )
 from nucliadb_models.security import RequestSecurity
-from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
+from nucliadb_protos import nodereader_pb2, utils_pb2
 from nucliadb_protos.noderesources_pb2 import Resource
 
 from .exceptions import InvalidQueryError
@@ -87,13 +83,6 @@ class QueryParser:
     """
 
     _query_information_task: Optional[asyncio.Task] = None
-    _get_vectorset_task: Optional[asyncio.Task] = None
-    _detected_entities_task: Optional[asyncio.Task] = None
-    _entities_meta_cache_task: Optional[asyncio.Task] = None
-    _deleted_entities_groups_task: Optional[asyncio.Task] = None
-    _synonyms_task: Optional[asyncio.Task] = None
-    _get_classification_labels_task: Optional[asyncio.Task] = None
-    _get_matryoshka_dimension_task: Optional[asyncio.Task] = None
 
     def __init__(
         self,
@@ -168,6 +157,15 @@ class QueryParser:
         self.max_tokens = max_tokens
         self.rank_fusion = rank_fusion
         self.reranker = reranker
+        self.fetcher = Fetcher(
+            kbid=kbid,
+            query=query,
+            user_vector=user_vector,
+            vectorset=vectorset,
+            rephrase=rephrase,
+            rephrase_prompt=rephrase_prompt,
+            generative_model=generative_model,
+        )
 
     @property
     def has_vector_search(self) -> bool:
@@ -183,78 +181,12 @@ class QueryParser:
         return self._query_information_task
 
     async def _query_information(self) -> QueryInfo:
-        vectorset = await self._select_vectorset()
-        return await query_information(
-            self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
-        )
-
-    def _get_vectorset(self) -> Awaitable[Optional[str]]:
-        if self._get_vectorset_task is None:
-            self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
-        return self._get_vectorset_task
-
-    async def _select_vectorset(self) -> Optional[str]:
-        if self.vectorset:
-            return self.vectorset
-
-        # When vectorset is not provided we get the default from Predict API
-
-        try:
-            query_information = await self._get_query_information()
-        except SendToPredictError:
-            return None
-
-        if query_information.sentence is None:
-            logger.error(
-                "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
-            )
-            return None
-
-        for vectorset in query_information.sentence.vectors.keys():
-            self.vectorset = vectorset
-            break
-
-        return self.vectorset
-
-    def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
-        if self._get_matryoshka_dimension_task is None:
-            self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
-        return self._get_matryoshka_dimension_task
-
-    async def _matryoshka_dimension(self) -> Optional[int]:
-        vectorset = await self._select_vectorset()
-        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
-
-    def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
-        if self._detected_entities_task is None:  # pragma: no cover
-            self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
-        return self._detected_entities_task
-
-    def _get_entities_meta_cache(
-        self,
-    ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
-        if self._entities_meta_cache_task is None:
-            self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
-        return self._entities_meta_cache_task
-
-    def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
-        if self._deleted_entities_groups_task is None:
-            self._deleted_entities_groups_task = asyncio.create_task(
-                get_deleted_entity_groups(self.kbid)
-            )
-        return self._deleted_entities_groups_task
-
-    def _get_synomyns(self) -> Awaitable[Optional[knowledgebox_pb2.Synonyms]]:
-        if self._synonyms_task is None:
-            self._synonyms_task = asyncio.create_task(get_kb_synonyms(self.kbid))
-        return self._synonyms_task
-
-    def _get_classification_labels(self) -> Awaitable[knowledgebox_pb2.Labels]:
-        if self._get_classification_labels_task is None:
-            self._get_classification_labels_task = asyncio.create_task(
-                get_classification_labels(self.kbid)
-            )
-        return self._get_classification_labels_task
+        # HACK: while transitioning to the new query parser, use fetcher under
+        # the hood for a smoother migration
+        query_info = await self.fetcher._predict_query_endpoint()
+        if query_info is None:
+            raise SendToPredictError("Error while using predict's query endpoint")
+        return query_info
 
     async def _schedule_dependency_tasks(self) -> None:
         """
@@ -262,21 +194,22 @@ class QueryParser:
         for the sake of the query being performed
         """
         if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
-            asyncio.ensure_future(self._get_classification_labels())
+            asyncio.ensure_future(self.fetcher.get_classification_labels())
 
         if self.has_vector_search and self.user_vector is None:
             self.query_endpoint_used = True
             asyncio.ensure_future(self._get_query_information())
-            asyncio.ensure_future(self._get_matryoshka_dimension())
+            # XXX: should we also ensure get_vectorset and get_query_vector?
+            asyncio.ensure_future(self.fetcher.get_matryoshka_dimension())
 
         if (self.has_relations_search or self.autofilter) and len(self.query) > 0:
             if not self.query_endpoint_used:
                 # If we only need to detect entities, we don't need the query endpoint
-                asyncio.ensure_future(self._get_detected_entities())
-            asyncio.ensure_future(self._get_entities_meta_cache())
-            asyncio.ensure_future(self._get_deleted_entity_groups())
+                asyncio.ensure_future(self.fetcher.get_detected_entities())
+            asyncio.ensure_future(self.fetcher.get_entities_meta_cache())
+            asyncio.ensure_future(self.fetcher.get_deleted_entity_groups())
         if self.with_synonyms and self.query:
-            asyncio.ensure_future(self._get_synomyns())
+            asyncio.ensure_future(self.fetcher.get_synonyms())
 
     async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str]]:
         """
@@ -309,7 +242,7 @@ class QueryParser:
         field_labels = self.flat_label_filters
         paragraph_labels: list[str] = []
         if has_classification_label_filters(self.flat_label_filters):
-            classification_labels = await self._get_classification_labels()
+            classification_labels = await self.fetcher.get_classification_labels()
             field_labels, paragraph_labels = split_labels_by_type(
                 self.flat_label_filters, classification_labels
             )
@@ -398,19 +331,13 @@ class QueryParser:
             semantic_min_score = self.min_score.semantic
         elif self.has_vector_search and not incomplete:
             query_information = await self._get_query_information()
-            vectorset = await self._select_vectorset()
-            if vectorset is not None:
-                semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
-                if semantic_threshold is not None:
-                    semantic_min_score = semantic_threshold
-                else:
-                    logger.warning(
-                        "Semantic threshold not found in query information, using default",
-                        extra={"kbid": self.kbid},
-                    )
+            vectorset = await self.fetcher.get_vectorset()
+            semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
+            if semantic_threshold is not None:
+                semantic_min_score = semantic_threshold
         else:
             logger.warning(
-                    "
+                "Semantic threshold not found in query information, using default",
                 extra={"kbid": self.kbid},
             )
         self.min_score.semantic = semantic_min_score
@@ -427,70 +354,18 @@ class QueryParser:
         request.paragraph = True
         node_features.inc({"type": "paragraphs"})
 
-    async def select_query_vectorset(self) -> Optional[str]:
-        """Set and return the requested vectorset parameter (if used) validated
-        for the current KB.
-
-        """
-        if not self.vectorset:
-            return None
-
-        # validate vectorset
-        async with datamanagers.with_ro_transaction() as txn:
-            if not await datamanagers.vectorsets.exists(
-                txn, kbid=self.kbid, vectorset_id=self.vectorset
-            ):
-                raise InvalidQueryError(
-                    "vectorset",
-                    f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
-                )
-        return self.vectorset
-
     async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
         if not self.has_vector_search:
             return False
 
         node_features.inc({"type": "vectors"})
 
-        incomplete = False
-
-        vectorset = await self.select_query_vectorset()
-        if vectorset is not None:
-            request.vectorset = vectorset
-
-        query_vector = None
-        if self.user_vector is None:
-            try:
-                query_info = await self._get_query_information()
-            except SendToPredictError as err:
-                logger.warning(f"Errors on predict api trying to embedd query: {err}")
-                incomplete = True
-            else:
-                if query_info and query_info.sentence:
-                    if vectorset:
-                        if vectorset in query_info.sentence.vectors:
-                            query_vector = query_info.sentence.vectors[vectorset]
-                        else:
-                            incomplete = True
-                    else:
-                        for vectorset_id, vector in query_info.sentence.vectors.items():
-                            if vector:
-                                query_vector = vector
-                                break
-                        else:
-                            incomplete = True
-
-                else:
-                    incomplete = True
-        else:
-            query_vector = self.user_vector
+        vectorset = await self.fetcher.get_vectorset()
+        query_vector = await self.fetcher.get_query_vector()
+        incomplete = query_vector is None
 
+        request.vectorset = vectorset
         if query_vector is not None:
-            matryoshka_dimension = await self._get_matryoshka_dimension()
-            if matryoshka_dimension is not None:
-                # KB using a matryoshka embeddings model, cut the query vector
-                # accordingly
-                query_vector = query_vector[:matryoshka_dimension]
             request.vector.extend(query_vector)
 
         return incomplete
@@ -498,20 +373,15 @@ class QueryParser:
     async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
         autofilters = []
         if self.has_relations_search or self.autofilter:
-            if not self.query_endpoint_used:
-                detected_entities = await self._get_detected_entities()
-            else:
-                query_info_result = await self._get_query_information()
-                if query_info_result.entities:
-                    detected_entities = convert_relations(query_info_result.entities.model_dump())
-                else:
-                    detected_entities = []
-            meta_cache = await self._get_entities_meta_cache()
+            detected_entities = await self.fetcher.get_detected_entities()
+            meta_cache = await self.fetcher.get_entities_meta_cache()
            
             detected_entities = expand_entities(meta_cache, detected_entities)
             if self.has_relations_search:
                 request.relation_subgraph.entry_points.extend(detected_entities)
                 request.relation_subgraph.depth = 1
-                request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
+                request.relation_subgraph.deleted_groups.extend(
+                    await self.fetcher.get_deleted_entity_groups()
+                )
                 for group_id, deleted_entities in meta_cache.deleted_entities.items():
                     request.relation_subgraph.deleted_entities.append(
                         nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
@@ -544,7 +414,7 @@ class QueryParser:
                 "Search with custom synonyms is only supported on paragraph and document search",
             )
 
-        synonyms = await self._get_synomyns()
+        synonyms = await self.fetcher.get_synonyms()
         if synonyms is None:
             # No synonyms found
             return
@@ -680,29 +550,6 @@ async def paragraph_query_to_pb(
     return request
 
 
-@query_parse_dependency_observer.wrap({"type": "query_information"})
-async def query_information(
-    kbid: str,
-    query: str,
-    semantic_model: Optional[str],
-    generative_model: Optional[str] = None,
-    rephrase: bool = False,
-    rephrase_prompt: Optional[str] = None,
-) -> QueryInfo:
-    predict = get_predict()
-    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
-
-
-@query_parse_dependency_observer.wrap({"type": "detect_entities"})
-async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
-    predict = get_predict()
-    try:
-        return await predict.detect_entities(kbid, query)
-    except SendToPredictError as ex:
-        logger.warning(f"Errors on predict api detecting entities: {ex}")
-        return []
-
-
 def expand_entities(
     meta_cache: datamanagers.entities.EntitiesMetaCache,
     detected_entities: list[utils_pb2.RelationNode],
@@ -833,30 +680,6 @@ PROCESSING_STATUS_TO_PB_MAP = {
 }
 
 
-@query_parse_dependency_observer.wrap({"type": "synonyms"})
-async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.synonyms.get(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
-async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
-async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
-
-
-@query_parse_dependency_observer.wrap({"type": "classification_labels"})
-async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.labels.get_labels(txn, kbid=kbid)
-
-
 def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
     """
     Check if the provided filters are supported:
@@ -889,28 +712,6 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
     )
 
 
-@alru_cache(maxsize=None)
-async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    # This can be safely cached as the matryoshka dimension is not expected to change
-    return await get_matryoshka_dimension(kbid, vectorset)
-
-
-@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
-async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    async with get_driver().transaction(read_only=True) as txn:
-        matryoshka_dimension = None
-        if not vectorset:
-            # XXX this should be migrated once we remove the "default" vectorset
-            # concept
-            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
-        else:
-            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
-            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
-                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
-
-        return matryoshka_dimension
-
-
 def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
     return {
         SortField.SCORE: None,
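Taken together, the query.py changes swap the parser's per-dependency task memoization for a single per-request `Fetcher`. A rough sketch of the new data flow for semantic search, using the real `Fetcher` constructor but simplified, hypothetical surrounding code:

from typing import Optional

from nucliadb.search.search.query_parser.fetcher import Fetcher

async def sketch_semantic_inputs(kbid: str, query: str) -> tuple[str, Optional[list[float]]]:
    # One fetcher per request; its cache deduplicates Predict API calls.
    fetcher = Fetcher(
        kbid,
        query=query,
        user_vector=None,   # no client-supplied embedding: Predict will compute one
        vectorset=None,     # no explicit vectorset: Predict or the KB picks a default
        rephrase=False,
        rephrase_prompt=None,
        generative_model=None,
    )
    vectorset = await fetcher.get_vectorset()        # may call Predict /query once
    query_vector = await fetcher.get_query_vector()  # reuses the cached /query answer
    return vectorset, query_vector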
nucliadb/search/search/query_parser/exceptions.py CHANGED
@@ -19,4 +19,14 @@
 #
 
 
-class
+class InternalParserError(ValueError):
+    """Raised when parsing fails due to some internal error"""
+
+
+class InvalidQueryError(Exception):
+    """Raised when parsing a query containing an invalid parameter"""
+
+    def __init__(self, param: str, reason: str):
+        self.param = param
+        self.reason = reason
+        super().__init__(f"Invalid query. Error in {param}: {reason}")
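The split gives callers two distinct failure modes: `InvalidQueryError` flags a bad user parameter, while `InternalParserError` (a `ValueError` subclass) flags a broken parser invariant. A hedged sketch of how a caller might tell them apart; the HTTP status framing is an assumption, not something this diff defines:

from nucliadb.search.search.query_parser.exceptions import (
    InternalParserError,
    InvalidQueryError,
)

def explain(exc: Exception) -> str:
    if isinstance(exc, InvalidQueryError):
        # user error: report the offending parameter (e.g. as an HTTP 4xx)
        return f"bad request: {exc.param}: {exc.reason}"
    elif isinstance(exc, InternalParserError):
        # internal error: also catchable as ValueError (e.g. as an HTTP 5xx)
        return f"internal parser error: {exc}"
    raise exc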
nucliadb/search/search/query_parser/fetcher.py ADDED
@@ -0,0 +1,399 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from typing import Optional, TypeVar, Union
+
+from async_lru import alru_cache
+from typing_extensions import TypeIs
+
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.utils import get_driver
+from nucliadb.search import logger
+from nucliadb.search.predict import SendToPredictError, convert_relations
+from nucliadb.search.search.metrics import (
+    query_parse_dependency_observer,
+)
+from nucliadb.search.search.query_parser.exceptions import InvalidQueryError
+from nucliadb.search.utilities import get_predict
+from nucliadb_models.internal.predict import QueryInfo
+from nucliadb_protos import knowledgebox_pb2, utils_pb2
+
+
+# We use a class as cache miss marker to allow None values in the cache and to
+# make mypy happy with typing
+class NotCached:
+    pass
+
+
+not_cached = NotCached()
+
+
+T = TypeVar("T")
+
+
+def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
+    return not isinstance(field, NotCached)
+
+
+class FetcherCache:
+    predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
+    predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
+
+    # semantic search
+    query_vector: Union[Optional[list[float]], NotCached] = not_cached
+    vectorset: Union[str, NotCached] = not_cached
+    matryoshka_dimension: Union[Optional[int], NotCached] = not_cached
+
+    labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached
+
+    synonyms: Union[Optional[knowledgebox_pb2.Synonyms], NotCached] = not_cached
+
+    entities_meta_cache: Union[datamanagers.entities.EntitiesMetaCache, NotCached] = not_cached
+    deleted_entity_groups: Union[list[str], NotCached] = not_cached
+    detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
+
+
+class Fetcher:
+    """Queries are getting more and more complex and different phases of the
+    query depend on different data, not only from the user but from other parts
+    of the system.
+
+    This class is an encapsulation of data gathering across different parts of
+    the system. Given the user query input, it aims to be as efficient as
+    possible removing redundant expensive calls to other parts of the system. An
+    instance of a fetcher caches its results and is thought to be used in the
+    context of a single request. DO NOT use this as a global object!
+
+    """
+
+    def __init__(
+        self,
+        kbid: str,
+        *,
+        query: str,
+        user_vector: Optional[list[float]],
+        vectorset: Optional[str],
+        rephrase: bool,
+        rephrase_prompt: Optional[str],
+        generative_model: Optional[str],
+    ):
+        self.kbid = kbid
+        self.query = query
+        self.user_vector = user_vector
+        self.user_vectorset = vectorset
+        self.rephrase = rephrase
+        self.rephrase_prompt = rephrase_prompt
+        self.generative_model = generative_model
+
+        self.cache = FetcherCache()
+        self._validated = False
+
+    # Validation
+
+    async def initial_validate(self):
+        """Runs a validation on the input parameters. It can raise errors if
+        there's some wrong parameter.
+
+        This function should always be called if validated input for fetching is
+        desired.
+        """
+        if self._validated:
+            return
+
+        self._validated = True
+
+    async def _validate_vectorset(self):
+        if self.user_vectorset is not None:
+            await validate_vectorset(self.kbid, self.user_vectorset)
+
+    # Semantic search
+
+    async def get_matryoshka_dimension(self) -> Optional[int]:
+        if is_cached(self.cache.matryoshka_dimension):
+            return self.cache.matryoshka_dimension
+
+        vectorset = await self.get_vectorset()
+        matryoshka_dimension = await get_matryoshka_dimension_cached(self.kbid, vectorset)
+        self.cache.matryoshka_dimension = matryoshka_dimension
+        return matryoshka_dimension
+
+    async def _get_user_vectorset(self) -> Optional[str]:
+        """Returns the user's requested vectorset and validates if it does exist
+        in the KB.
+
+        """
+        vectorset = self.user_vectorset
+        if not self._validated:
+            await self._validate_vectorset()
+        return vectorset
+
+    async def get_vectorset(self) -> str:
+        """Get the vectorset to be used in the search. If not specified by the
+        user, Predict API or the KB itself will provide a default.
+
+        """
+
+        if is_cached(self.cache.vectorset):
+            return self.cache.vectorset
+
+        if self.user_vectorset:
+            # user explicitly asked for a vectorset
+            self.cache.vectorset = self.user_vectorset
+            return self.user_vectorset
+
+        # when it's not provided, we get the default from Predict API
+        query_info = await self._predict_query_endpoint()
+        if query_info is None:
+            vectorset = None
+        else:
+            if query_info.sentence is None:
+                logger.error(
+                    "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
+                )
+                vectorset = None
+            else:
+                # vectors field is enforced by the data model to have at least one key
+                for vectorset in query_info.sentence.vectors.keys():
+                    vectorset = vectorset
+                    break
+
+        if vectorset is None:
+            # in case predict doesn't answer which vectorset to use, fallback to
+            # the first vectorset of the KB
+            async with datamanagers.with_ro_transaction() as txn:
+                async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
+                    break
+            assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
+
+        self.cache.vectorset = vectorset
+        return vectorset
+
+    async def get_query_vector(self) -> Optional[list[float]]:
+        if is_cached(self.cache.query_vector):
+            return self.cache.query_vector
+
+        if self.user_vector is not None:
+            query_vector = self.user_vector
+        else:
+            query_info = await self._predict_query_endpoint()
+            if query_info is None or query_info.sentence is None:
+                self.cache.query_vector = None
+                return None
+
+            vectorset = await self.get_vectorset()
+            if vectorset not in query_info.sentence.vectors:
+                logger.warning(
+                    "Predict is not responding with a valid query nucliadb vectorset",
+                    extra={
+                        "kbid": self.kbid,
+                        "vectorset": vectorset,
+                        "predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
+                    },
+                )
+                self.cache.query_vector = None
+                return None
+
+            query_vector = query_info.sentence.vectors[vectorset]
+
+        matryoshka_dimension = await self.get_matryoshka_dimension()
+        if matryoshka_dimension is not None:
+            if self.user_vector is not None and len(query_vector) < matryoshka_dimension:
+                raise InvalidQueryError(
+                    "vector",
+                    f"Invalid vector length, please check valid embedding size for {vectorset} model",
+                )
+
+            # KB using a matryoshka embeddings model, cut the query vector
+            # accordingly
+            query_vector = query_vector[:matryoshka_dimension]
+
+        self.cache.query_vector = query_vector
+        return query_vector
+
+    # Labels
+
+    async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
+        if is_cached(self.cache.labels):
+            return self.cache.labels
+
+        labels = await get_classification_labels(self.kbid)
+        self.cache.labels = labels
+        return labels
+
+    # Entities
+
+    async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
+        if is_cached(self.cache.entities_meta_cache):
+            return self.cache.entities_meta_cache
+
+        entities_meta_cache = await get_entities_meta_cache(self.kbid)
+        self.cache.entities_meta_cache = entities_meta_cache
+        return entities_meta_cache
+
+    async def get_deleted_entity_groups(self) -> list[str]:
+        if is_cached(self.cache.deleted_entity_groups):
+            return self.cache.deleted_entity_groups
+
+        deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
+        self.cache.deleted_entity_groups = deleted_entity_groups
+        return deleted_entity_groups
+
+    async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
+        if is_cached(self.cache.detected_entities):
+            return self.cache.detected_entities
+
+        # Optimization to avoid calling predict twice
+        if is_cached(self.cache.predict_query_info):
+            # /query is a superset of detect entities, so we already have them
+            query_info = self.cache.predict_query_info
+            if query_info is not None and query_info.entities is not None:
+                detected_entities = convert_relations(query_info.entities.model_dump())
+            else:
+                detected_entities = []
+        else:
+            # No call to /query has been done, we'll use detect entities
+            # endpoint instead (as it's faster)
+            detected_entities = await self._predict_detect_entities()
+
+        self.cache.detected_entities = detected_entities
+        return detected_entities
+
+    # Synonyms
+
+    async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
+        if is_cached(self.cache.synonyms):
+            return self.cache.synonyms
+
+        synonyms = await get_kb_synonyms(self.kbid)
+        self.cache.synonyms = synonyms
+        return synonyms
+
+    # Predict API
+
+    async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
+        if is_cached(self.cache.predict_query_info):
+            return self.cache.predict_query_info
+
+        # calling twice should be avoided as query endpoint is a superset of detect entities
+        if is_cached(self.cache.predict_detected_entities):
+            logger.warning("Fetcher is not being efficient enough and has called predict twice!")
+
+        # we can't call get_vectorset, as it would do a recursive loop between
+        # functions, so we'll manually parse it
+        vectorset = await self._get_user_vectorset()
+        try:
+            query_info = await query_information(
+                self.kbid,
+                self.query,
+                vectorset,
+                self.generative_model,
+                self.rephrase,
+                self.rephrase_prompt,
+            )
+        except SendToPredictError:
+            query_info = None
+
+        self.cache.predict_query_info = query_info
+        return query_info
+
+    async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
+        if is_cached(self.cache.predict_detected_entities):
+            return self.cache.predict_detected_entities
+
+        try:
+            detected_entities = await detect_entities(self.kbid, self.query)
+        except SendToPredictError as ex:
+            logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
+            detected_entities = []
+
+        self.cache.predict_detected_entities = detected_entities
+        return detected_entities
+
+
+async def validate_vectorset(kbid: str, vectorset: str):
+    async with datamanagers.with_ro_transaction() as txn:
+        if not await datamanagers.vectorsets.exists(txn, kbid=kbid, vectorset_id=vectorset):
+            raise InvalidQueryError(
+                "vectorset", f"Vectorset {vectorset} doesn't exist in your Knowledge Box"
+            )
+
+
+@query_parse_dependency_observer.wrap({"type": "query_information"})
+async def query_information(
+    kbid: str,
+    query: str,
+    semantic_model: Optional[str],
+    generative_model: Optional[str] = None,
+    rephrase: bool = False,
+    rephrase_prompt: Optional[str] = None,
+) -> QueryInfo:
+    predict = get_predict()
+    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
+
+
+@query_parse_dependency_observer.wrap({"type": "detect_entities"})
+async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
+    predict = get_predict()
+    return await predict.detect_entities(kbid, query)
+
+
+@alru_cache(maxsize=None)
+async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+    # This can be safely cached as the matryoshka dimension is not expected to change
+    return await get_matryoshka_dimension(kbid, vectorset)
+
+
+@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
+async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+    async with get_driver().transaction(read_only=True) as txn:
+        matryoshka_dimension = None
+        if not vectorset:
+            # XXX this should be migrated once we remove the "default" vectorset
+            # concept
+            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
+        else:
+            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
+            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
+                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
+
+        return matryoshka_dimension
+
+
+@query_parse_dependency_observer.wrap({"type": "classification_labels"})
+async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.labels.get_labels(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "synonyms"})
+async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.synonyms.get(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
+async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
+async def get_deleted_entity_groups(kbid: str) -> list[str]:
+    async with get_driver().transaction(read_only=True) as txn:
+        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
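A detail worth calling out in fetcher.py is the `NotCached` sentinel: `None` is a legitimate cached value (a KB with no synonyms, no matryoshka dimension), so `Optional` cannot double as the miss marker, and `TypeIs` lets mypy narrow `Union[T, NotCached]` down to `T` after the check. The same pattern in isolation, as a standalone sketch with a made-up `Demo` class:

from typing import Optional, TypeVar, Union

from typing_extensions import TypeIs

class NotCached:
    pass

not_cached = NotCached()
T = TypeVar("T")

def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
    return not isinstance(field, NotCached)

class Demo:
    # "not fetched yet" and "fetched, and the answer was None" stay distinct
    value: Union[Optional[str], NotCached] = not_cached

    async def get_value(self) -> Optional[str]:
        if is_cached(self.value):
            return self.value  # cache hit, possibly a legitimate None
        result = await self._expensive_lookup()
        self.value = result
        return result

    async def _expensive_lookup(self) -> Optional[str]:
        return None  # "nothing configured" is a valid, cacheable answer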
nucliadb/search/search/query_parser/parser.py CHANGED
@@ -26,7 +26,7 @@ from nucliadb.search.search.filters import (
     convert_to_node_filters,
     translate_label_filters,
 )
-from nucliadb.search.search.query_parser.exceptions import
+from nucliadb.search.search.query_parser.exceptions import InternalParserError
 from nucliadb.search.search.query_parser.models import (
     CatalogFilters,
     CatalogQuery,
@@ -50,25 +50,26 @@ from nucliadb_models.search import (
 )
 
 
-def parse_find(item: FindRequest) -> UnitRetrieval:
-    parser = _FindParser(item)
-    return parser.parse()
+async def parse_find(kbid: str, item: FindRequest) -> UnitRetrieval:
+    parser = _FindParser(kbid, item)
+    return await parser.parse()
 
 
 class _FindParser:
-    def __init__(self, item: FindRequest):
+    def __init__(self, kbid: str, item: FindRequest):
+        self.kbid = kbid
         self.item = item
 
-    def parse(self) -> UnitRetrieval:
+    async def parse(self) -> UnitRetrieval:
        top_k = self._parse_top_k()
        try:
            rank_fusion = self._parse_rank_fusion()
        except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in rank fusion: {str(exc)}") from exc
        try:
            reranker = self._parse_reranker()
        except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in reranker: {str(exc)}") from exc
 
        # Adjust retrieval windows. Our current implementation assume:
        # `top_k <= reranker.window <= rank_fusion.window`
@@ -98,7 +99,7 @@ class _FindParser:
             if self.item.rank_fusion == search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
                 rank_fusion = ReciprocalRankFusion(window=window)
             else:
-                raise
+                raise InternalParserError(f"Unknown rank fusion algorithm: {self.item.rank_fusion}")
 
         elif isinstance(self.item.rank_fusion, search_models.ReciprocalRankFusion):
             user_window = self.item.rank_fusion.window
@@ -109,7 +110,7 @@ class _FindParser:
             )
 
         else:
-            raise
+            raise InternalParserError(f"Unknown rank fusion {self.item.rank_fusion}")
 
         return rank_fusion
 
@@ -131,14 +132,14 @@ class _FindParser:
                 reranking = PredictReranker(window=min(top_k * 2, 200))
 
             else:
-                raise
+                raise InternalParserError(f"Unknown reranker algorithm: {self.item.reranker}")
 
         elif isinstance(self.item.reranker, search_models.PredictReranker):
             user_window = self.item.reranker.window
             reranking = PredictReranker(window=min(max(user_window or 0, top_k), 200))
 
         else:
-            raise
+            raise InternalParserError(f"Unknown reranker {self.item.reranker}")
 
         return reranking
 
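With these parser.py changes, `parse_find` becomes async and KB-aware, and pydantic `ValidationError`s raised while building rank fusion or reranker configs surface as `InternalParserError` (note the `from exc` chaining above). A sketch of what a call site sees; the logging around it is illustrative:

import logging

from nucliadb.search.search.query_parser.exceptions import InternalParserError
from nucliadb.search.search.query_parser.parser import parse_find

async def parse_or_log(kbid, find_request):
    try:
        return await parse_find(kbid, find_request)  # now async and takes the kbid
    except InternalParserError as exc:
        # __cause__ carries the original pydantic ValidationError when there was one
        logging.error("find parsing failed: %s (cause: %r)", exc, exc.__cause__)
        raise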
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: nucliadb
-Version: 6.2.1.post3139
+Version: 6.2.1.post3165
 Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
 Author: NucliaDB Community
 Author-email: nucliadb@nuclia.com
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9, <4
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post3139
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post3139
-Requires-Dist: nucliadb-protos>=6.2.1.post3139
-Requires-Dist: nucliadb-models>=6.2.1.post3139
+Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post3165
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post3165
+Requires-Dist: nucliadb-protos>=6.2.1.post3165
+Requires-Dist: nucliadb-models>=6.2.1.post3165
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/RECORD CHANGED
@@ -207,10 +207,10 @@ nucliadb/search/requesters/utils.py,sha256=ZTiWDkDihJ7rcvs7itCe8hr6OclVcvu_2EAPF
 nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/cache.py,sha256=n9vkN6Y6Xnr2RBJyoH0WzjzGTJOMfKekU9tfPTWWCPc,6810
 nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
-nucliadb/search/search/exceptions.py,sha256=
+nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77ofkOScEI,1039
 nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_QziX4,6184
 nucliadb/search/search/filters.py,sha256=1MkHlJjAQqoRCj7e5cEzK2HvBxGLE17I_omsjiklbtw,6476
-nucliadb/search/search/find.py,sha256=
+nucliadb/search/search/find.py,sha256=DaO3CPBQqRAw-iK_DNf_gM-aEipjtuX6oA2TbAplkxs,9901
 nucliadb/search/search/find_merge.py,sha256=5Aqz54E5GG8jw666KNncVHIJcs821ug-YwJ46YL6Br8,17363
 nucliadb/search/search/graph_strategy.py,sha256=Egcq_zn895gTUYmyQTsXj8YaUMa3HBKhcSa1GBvgzAM,31877
 nucliadb/search/search/hydrator.py,sha256=-R37gCrGxkyaiHQalnTWHNG_FCx11Zucd7qA1vQCxuw,6985
@@ -219,7 +219,7 @@ nucliadb/search/search/metrics.py,sha256=81X-tahGW4n2CLvUzCPdNxNClmZqUWZjcVOGCUH
 nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
 nucliadb/search/search/pgcatalog.py,sha256=IaNK4dAxdXs38PoIkTdgqMDuZDjeiOtcXn3LeaT-OMw,8855
 nucliadb/search/search/predict_proxy.py,sha256=xBlh6kjuQpWRq7KsBx4pEl2PtnwljjQIiYMaTWpcCSA,3015
-nucliadb/search/search/query.py,sha256=
+nucliadb/search/search/query.py,sha256=doRdBhM928wB64v271RSyJxsRT5qd6oevImEMz4gpvw,29487
 nucliadb/search/search/rank_fusion.py,sha256=tRGo_KlsFsVx1CQEy1iqQ6f0T1Dq1kf0axDXHuuzvvM,6946
 nucliadb/search/search/rerankers.py,sha256=0kAHES9X_FKkP7KSN9NRETFmRPKzwrFAo_54MbyvM7Q,9051
 nucliadb/search/search/shards.py,sha256=JSRSrHgHcF4sXyuZZoJdMfK0v_LHpoSRf1lCr5-K5ko,2742
@@ -232,9 +232,10 @@ nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqE
 nucliadb/search/search/chat/prompt.py,sha256=r2JTiRWH3YHPdeRAG5w6gD0g0fWVxdTjYIR86qAVa7k,47106
 nucliadb/search/search/chat/query.py,sha256=rBssR6MPSx8h2DASRMTLODaz9oGE5tNVVVeDncSrEp4,15684
 nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/search/search/query_parser/exceptions.py,sha256=
+nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
+nucliadb/search/search/query_parser/fetcher.py,sha256=NnzbRIhtg15_N9rw6uNXgPLNOjmO_dv8HMvAskLZ6-g,15496
 nucliadb/search/search/query_parser/models.py,sha256=-VlCDXUCgOroAZw1Leqhj2VMgRv_CD2w40PXXOBLaUM,2332
-nucliadb/search/search/query_parser/parser.py,sha256=
+nucliadb/search/search/query_parser/parser.py,sha256=JC6koS9Np1PzCfEk1Xy6mpP1HmovS_vIxxA9u-kwzos,6498
 nucliadb/standalone/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/standalone/api_router.py,sha256=zR03TQ-Pd2kXx1jeV83Puw19112Z8Jhln7p1cAn69kg,6699
 nucliadb/standalone/app.py,sha256=mAApNK_iVsQgJyd-mtwCeZq5csSimwnXmlQGH9a70pE,5586
@@ -331,9 +332,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.2.1.post3139.dist-info/METADATA,sha256=
-nucliadb-6.2.1.post3139.dist-info/WHEEL,sha256=
-nucliadb-6.2.1.post3139.dist-info/entry_points.txt,sha256=
-nucliadb-6.2.1.post3139.dist-info/top_level.txt,sha256=
-nucliadb-6.2.1.post3139.dist-info/zip-safe,sha256=
-nucliadb-6.2.1.post3139.dist-info/RECORD,,
+nucliadb-6.2.1.post3165.dist-info/METADATA,sha256=9FA7BAbWWQlT3pJKH0iexO3PiSOl7mpz-PAh8W7kdxs,4603
+nucliadb-6.2.1.post3165.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nucliadb-6.2.1.post3165.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.2.1.post3165.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.2.1.post3165.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb-6.2.1.post3165.dist-info/RECORD,,
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/WHEEL: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/entry_points.txt: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/top_level.txt: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/zip-safe: File without changes