nucliadb 6.2.1.post3139__py3-none-any.whl → 6.2.1.post3165__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +1 -1
- nucliadb/search/search/query.py +41 -240
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +399 -0
- nucliadb/search/search/query_parser/parser.py +13 -12
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/METADATA +5 -5
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/RECORD +12 -11
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/zip-safe +0 -0
nucliadb/search/search/exceptions.py CHANGED
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+from nucliadb.search.search.query_parser.exceptions import InvalidQueryError as InvalidQueryError
+
 
 class IncompleteFindResultsError(Exception):
     pass
@@ -24,10 +26,3 @@ class IncompleteFindResultsError(Exception):
 
 class ResourceNotFoundError(Exception):
     pass
-
-
-class InvalidQueryError(Exception):
-    def __init__(self, param: str, reason: str):
-        self.param = param
-        self.reason = reason
-        super().__init__(f"Invalid query. Error in {param}: {reason}")
nucliadb/search/search/find.py CHANGED
@@ -260,7 +260,7 @@ async def query_parser_from_find_request(
     # XXX this is becoming the new /find query parsing, this should be moved to
     # a cleaner abstraction
 
-    parsed = parse_find(item)
+    parsed = await parse_find(kbid, item)
 
     rank_fusion = get_rank_fusion(parsed.rank_fusion)
     reranker = get_reranker(parsed.reranker)
nucliadb/search/search/query.py CHANGED
@@ -23,12 +23,9 @@ import string
 from datetime import datetime
 from typing import Any, Awaitable, Optional, Union
 
-from async_lru import alru_cache
-
 from nucliadb.common import datamanagers
-from nucliadb.common.maindb.utils import get_driver
 from nucliadb.search import logger
-from nucliadb.search.predict import SendToPredictError, convert_relations
+from nucliadb.search.predict import SendToPredictError
 from nucliadb.search.search.filters import (
     convert_to_node_filters,
     flatten_filter_literals,
@@ -39,15 +36,14 @@ from nucliadb.search.search.filters import (
 )
 from nucliadb.search.search.metrics import (
     node_features,
-    query_parse_dependency_observer,
 )
+from nucliadb.search.search.query_parser.fetcher import Fetcher, get_classification_labels
 from nucliadb.search.search.rank_fusion import (
     RankFusionAlgorithm,
 )
 from nucliadb.search.search.rerankers import (
     Reranker,
 )
-from nucliadb.search.utilities import get_predict
 from nucliadb_models.internal.predict import QueryInfo
 from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
@@ -63,7 +59,7 @@ from nucliadb_models.search import (
     SuggestOptions,
 )
 from nucliadb_models.security import RequestSecurity
-from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
+from nucliadb_protos import nodereader_pb2, utils_pb2
 from nucliadb_protos.noderesources_pb2 import Resource
 
 from .exceptions import InvalidQueryError
@@ -87,13 +83,6 @@ class QueryParser:
     """
 
     _query_information_task: Optional[asyncio.Task] = None
-    _get_vectorset_task: Optional[asyncio.Task] = None
-    _detected_entities_task: Optional[asyncio.Task] = None
-    _entities_meta_cache_task: Optional[asyncio.Task] = None
-    _deleted_entities_groups_task: Optional[asyncio.Task] = None
-    _synonyms_task: Optional[asyncio.Task] = None
-    _get_classification_labels_task: Optional[asyncio.Task] = None
-    _get_matryoshka_dimension_task: Optional[asyncio.Task] = None
 
     def __init__(
         self,
@@ -168,6 +157,15 @@ class QueryParser:
         self.max_tokens = max_tokens
         self.rank_fusion = rank_fusion
         self.reranker = reranker
+        self.fetcher = Fetcher(
+            kbid=kbid,
+            query=query,
+            user_vector=user_vector,
+            vectorset=vectorset,
+            rephrase=rephrase,
+            rephrase_prompt=rephrase_prompt,
+            generative_model=generative_model,
+        )
 
     @property
     def has_vector_search(self) -> bool:
@@ -183,78 +181,12 @@ class QueryParser:
         return self._query_information_task
 
     async def _query_information(self) -> QueryInfo:
-        vectorset = await self._select_vectorset()
-        return await query_information(
-            self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
-        )
-
-    def _get_vectorset(self) -> Awaitable[Optional[str]]:
-        if self._get_vectorset_task is None:
-            self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
-        return self._get_vectorset_task
-
-    async def _select_vectorset(self) -> Optional[str]:
-        if self.vectorset:
-            return self.vectorset
-
-        # When vectorset is not provided we get the default from Predict API
-
-        try:
-            query_information = await self._get_query_information()
-        except SendToPredictError:
-            return None
-
-        if query_information.sentence is None:
-            logger.error(
-                "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
-            )
-            return None
-
-        for vectorset in query_information.sentence.vectors.keys():
-            self.vectorset = vectorset
-            break
-
-        return self.vectorset
-
-    def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
-        if self._get_matryoshka_dimension_task is None:
-            self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
-        return self._get_matryoshka_dimension_task
-
-    async def _matryoshka_dimension(self) -> Optional[int]:
-        vectorset = await self._select_vectorset()
-        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
-
-    def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
-        if self._detected_entities_task is None:  # pragma: no cover
-            self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
-        return self._detected_entities_task
-
-    def _get_entities_meta_cache(
-        self,
-    ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
-        if self._entities_meta_cache_task is None:
-            self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
-        return self._entities_meta_cache_task
-
-    def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
-        if self._deleted_entities_groups_task is None:
-            self._deleted_entities_groups_task = asyncio.create_task(
-                get_deleted_entity_groups(self.kbid)
-            )
-        return self._deleted_entities_groups_task
-
-    def _get_synomyns(self) -> Awaitable[Optional[knowledgebox_pb2.Synonyms]]:
-        if self._synonyms_task is None:
-            self._synonyms_task = asyncio.create_task(get_kb_synonyms(self.kbid))
-        return self._synonyms_task
-
-    def _get_classification_labels(self) -> Awaitable[knowledgebox_pb2.Labels]:
-        if self._get_classification_labels_task is None:
-            self._get_classification_labels_task = asyncio.create_task(
-                get_classification_labels(self.kbid)
-            )
-        return self._get_classification_labels_task
+        # HACK: while transitioning to the new query parser, use fetcher under
+        # the hood for a smoother migration
+        query_info = await self.fetcher._predict_query_endpoint()
+        if query_info is None:
+            raise SendToPredictError("Error while using predict's query endpoint")
+        return query_info
 
     async def _schedule_dependency_tasks(self) -> None:
         """
@@ -262,21 +194,22 @@ class QueryParser:
         for the sake of the query being performed
         """
         if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
-            asyncio.ensure_future(self._get_classification_labels())
+            asyncio.ensure_future(self.fetcher.get_classification_labels())
 
         if self.has_vector_search and self.user_vector is None:
             self.query_endpoint_used = True
             asyncio.ensure_future(self._get_query_information())
-            asyncio.ensure_future(self._get_matryoshka_dimension())
+            # XXX: should we also ensure get_vectorset and get_query_vector?
+            asyncio.ensure_future(self.fetcher.get_matryoshka_dimension())
 
         if (self.has_relations_search or self.autofilter) and len(self.query) > 0:
             if not self.query_endpoint_used:
                 # If we only need to detect entities, we don't need the query endpoint
-                asyncio.ensure_future(self._get_detected_entities())
-            asyncio.ensure_future(self._get_entities_meta_cache())
-            asyncio.ensure_future(self._get_deleted_entity_groups())
+                asyncio.ensure_future(self.fetcher.get_detected_entities())
+            asyncio.ensure_future(self.fetcher.get_entities_meta_cache())
+            asyncio.ensure_future(self.fetcher.get_deleted_entity_groups())
         if self.with_synonyms and self.query:
-            asyncio.ensure_future(self._get_synomyns())
+            asyncio.ensure_future(self.fetcher.get_synonyms())
 
     async def parse(self) -> tuple[nodereader_pb2.SearchRequest, bool, list[str]]:
         """
@@ -309,7 +242,7 @@ class QueryParser:
         field_labels = self.flat_label_filters
         paragraph_labels: list[str] = []
         if has_classification_label_filters(self.flat_label_filters):
-            classification_labels = await self._get_classification_labels()
+            classification_labels = await self.fetcher.get_classification_labels()
             field_labels, paragraph_labels = split_labels_by_type(
                 self.flat_label_filters, classification_labels
             )
@@ -398,19 +331,13 @@ class QueryParser:
             semantic_min_score = self.min_score.semantic
         elif self.has_vector_search and not incomplete:
             query_information = await self._get_query_information()
-            vectorset = await self._select_vectorset()
-            if vectorset is not None:
-                semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
-                if semantic_threshold is not None:
-                    semantic_min_score = semantic_threshold
-                else:
-                    logger.warning(
-                        "Semantic threshold not found in query information, using default",
-                        extra={"kbid": self.kbid},
-                    )
+            vectorset = await self.fetcher.get_vectorset()
+            semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
+            if semantic_threshold is not None:
+                semantic_min_score = semantic_threshold
         else:
             logger.warning(
-                    "
+                "Semantic threshold not found in query information, using default",
                 extra={"kbid": self.kbid},
             )
         self.min_score.semantic = semantic_min_score
@@ -427,70 +354,18 @@ class QueryParser:
         request.paragraph = True
         node_features.inc({"type": "paragraphs"})
 
-    async def select_query_vectorset(self) -> Optional[str]:
-        """Set and return the requested vectorset parameter (if used) validated
-        for the current KB.
-
-        """
-        if not self.vectorset:
-            return None
-
-        # validate vectorset
-        async with datamanagers.with_ro_transaction() as txn:
-            if not await datamanagers.vectorsets.exists(
-                txn, kbid=self.kbid, vectorset_id=self.vectorset
-            ):
-                raise InvalidQueryError(
-                    "vectorset",
-                    f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
-                )
-        return self.vectorset
-
     async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
         if not self.has_vector_search:
             return False
 
         node_features.inc({"type": "vectors"})
 
-        incomplete = False
-
-        vectorset = await self.select_query_vectorset()
-        if vectorset is not None:
-            request.vectorset = vectorset
-
-        query_vector = None
-        if self.user_vector is None:
-            try:
-                query_info = await self._get_query_information()
-            except SendToPredictError as err:
-                logger.warning(f"Errors on predict api trying to embedd query: {err}")
-                incomplete = True
-            else:
-                if query_info and query_info.sentence:
-                    if vectorset:
-                        if vectorset in query_info.sentence.vectors:
-                            query_vector = query_info.sentence.vectors[vectorset]
-                        else:
-                            incomplete = True
-                    else:
-                        for vectorset_id, vector in query_info.sentence.vectors.items():
-                            if vector:
-                                query_vector = vector
-                                break
-                        else:
-                            incomplete = True
-
-                else:
-                    incomplete = True
-        else:
-            query_vector = self.user_vector
+        vectorset = await self.fetcher.get_vectorset()
+        query_vector = await self.fetcher.get_query_vector()
+        incomplete = query_vector is None
 
+        request.vectorset = vectorset
         if query_vector is not None:
-            matryoshka_dimension = await self._get_matryoshka_dimension()
-            if matryoshka_dimension is not None:
-                # KB using a matryoshka embeddings model, cut the query vector
-                # accordingly
-                query_vector = query_vector[:matryoshka_dimension]
             request.vector.extend(query_vector)
 
         return incomplete
@@ -498,20 +373,15 @@ class QueryParser:
     async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
         autofilters = []
         if self.has_relations_search or self.autofilter:
-            if not self.query_endpoint_used:
-                detected_entities = await self._get_detected_entities()
-            else:
-                query_info_result = await self._get_query_information()
-                if query_info_result.entities:
-                    detected_entities = convert_relations(query_info_result.entities.model_dump())
-                else:
-                    detected_entities = []
-            meta_cache = await self._get_entities_meta_cache()
+            detected_entities = await self.fetcher.get_detected_entities()
+            meta_cache = await self.fetcher.get_entities_meta_cache()
            
             detected_entities = expand_entities(meta_cache, detected_entities)
             if self.has_relations_search:
                 request.relation_subgraph.entry_points.extend(detected_entities)
                 request.relation_subgraph.depth = 1
-                request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
+                request.relation_subgraph.deleted_groups.extend(
+                    await self.fetcher.get_deleted_entity_groups()
+                )
                 for group_id, deleted_entities in meta_cache.deleted_entities.items():
                     request.relation_subgraph.deleted_entities.append(
                         nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
@@ -544,7 +414,7 @@ class QueryParser:
                 "Search with custom synonyms is only supported on paragraph and document search",
             )
 
-        synonyms = await self._get_synomyns()
+        synonyms = await self.fetcher.get_synonyms()
         if synonyms is None:
             # No synonyms found
             return
@@ -680,29 +550,6 @@ async def paragraph_query_to_pb(
     return request
 
 
-@query_parse_dependency_observer.wrap({"type": "query_information"})
-async def query_information(
-    kbid: str,
-    query: str,
-    semantic_model: Optional[str],
-    generative_model: Optional[str] = None,
-    rephrase: bool = False,
-    rephrase_prompt: Optional[str] = None,
-) -> QueryInfo:
-    predict = get_predict()
-    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
-
-
-@query_parse_dependency_observer.wrap({"type": "detect_entities"})
-async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
-    predict = get_predict()
-    try:
-        return await predict.detect_entities(kbid, query)
-    except SendToPredictError as ex:
-        logger.warning(f"Errors on predict api detecting entities: {ex}")
-        return []
-
-
 def expand_entities(
     meta_cache: datamanagers.entities.EntitiesMetaCache,
     detected_entities: list[utils_pb2.RelationNode],
@@ -833,30 +680,6 @@ PROCESSING_STATUS_TO_PB_MAP = {
 }
 
 
-@query_parse_dependency_observer.wrap({"type": "synonyms"})
-async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.synonyms.get(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
-async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
-
-
-@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
-async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    async with get_driver().transaction(read_only=True) as txn:
-        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
-
-
-@query_parse_dependency_observer.wrap({"type": "classification_labels"})
-async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    async with get_driver().transaction(read_only=True) as txn:
-        return await datamanagers.labels.get_labels(txn, kbid=kbid)
-
-
 def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
     """
     Check if the provided filters are supported:
@@ -889,28 +712,6 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
     )
 
 
-@alru_cache(maxsize=None)
-async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    # This can be safely cached as the matryoshka dimension is not expected to change
-    return await get_matryoshka_dimension(kbid, vectorset)
-
-
-@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
-async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    async with get_driver().transaction(read_only=True) as txn:
-        matryoshka_dimension = None
-        if not vectorset:
-            # XXX this should be migrated once we remove the "default" vectorset
-            # concept
-            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
-        else:
-            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
-            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
-                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
-
-        return matryoshka_dimension
-
-
 def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
     return {
         SortField.SCORE: None,
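Taken together, the query.py changes swap the parser's per-dependency task memoization for a single per-request `Fetcher`. A rough sketch of the new data flow for semantic search, using the real `Fetcher` constructor but simplified, hypothetical surrounding code:

from typing import Optional

from nucliadb.search.search.query_parser.fetcher import Fetcher

async def sketch_semantic_inputs(kbid: str, query: str) -> tuple[str, Optional[list[float]]]:
    # One fetcher per request; its cache deduplicates Predict API calls.
    fetcher = Fetcher(
        kbid,
        query=query,
        user_vector=None,   # no client-supplied embedding: Predict will compute one
        vectorset=None,     # no explicit vectorset: Predict or the KB picks a default
        rephrase=False,
        rephrase_prompt=None,
        generative_model=None,
    )
    vectorset = await fetcher.get_vectorset()        # may call Predict /query once
    query_vector = await fetcher.get_query_vector()  # reuses the cached /query answer
    return vectorset, query_vector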
nucliadb/search/search/query_parser/exceptions.py CHANGED
@@ -19,4 +19,14 @@
 #
 
 
-class
+class InternalParserError(ValueError):
+    """Raised when parsing fails due to some internal error"""
+
+
+class InvalidQueryError(Exception):
+    """Raised when parsing a query containing an invalid parameter"""
+
+    def __init__(self, param: str, reason: str):
+        self.param = param
+        self.reason = reason
+        super().__init__(f"Invalid query. Error in {param}: {reason}")
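The split gives callers two distinct failure modes: `InvalidQueryError` flags a bad user parameter, while `InternalParserError` (a `ValueError` subclass) flags a broken parser invariant. A hedged sketch of how a caller might tell them apart; the HTTP status framing is an assumption, not something this diff defines:

from nucliadb.search.search.query_parser.exceptions import (
    InternalParserError,
    InvalidQueryError,
)

def explain(exc: Exception) -> str:
    if isinstance(exc, InvalidQueryError):
        # user error: report the offending parameter (e.g. as an HTTP 4xx)
        return f"bad request: {exc.param}: {exc.reason}"
    elif isinstance(exc, InternalParserError):
        # internal error: also catchable as ValueError (e.g. as an HTTP 5xx)
        return f"internal parser error: {exc}"
    raise exc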
nucliadb/search/search/query_parser/fetcher.py ADDED
@@ -0,0 +1,399 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from typing import Optional, TypeVar, Union
+
+from async_lru import alru_cache
+from typing_extensions import TypeIs
+
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.utils import get_driver
+from nucliadb.search import logger
+from nucliadb.search.predict import SendToPredictError, convert_relations
+from nucliadb.search.search.metrics import (
+    query_parse_dependency_observer,
+)
+from nucliadb.search.search.query_parser.exceptions import InvalidQueryError
+from nucliadb.search.utilities import get_predict
+from nucliadb_models.internal.predict import QueryInfo
+from nucliadb_protos import knowledgebox_pb2, utils_pb2
+
+
+# We use a class as cache miss marker to allow None values in the cache and to
+# make mypy happy with typing
+class NotCached:
+    pass
+
+
+not_cached = NotCached()
+
+
+T = TypeVar("T")
+
+
+def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
+    return not isinstance(field, NotCached)
+
+
+class FetcherCache:
+    predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
+    predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
+
+    # semantic search
+    query_vector: Union[Optional[list[float]], NotCached] = not_cached
+    vectorset: Union[str, NotCached] = not_cached
+    matryoshka_dimension: Union[Optional[int], NotCached] = not_cached
+
+    labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached
+
+    synonyms: Union[Optional[knowledgebox_pb2.Synonyms], NotCached] = not_cached
+
+    entities_meta_cache: Union[datamanagers.entities.EntitiesMetaCache, NotCached] = not_cached
+    deleted_entity_groups: Union[list[str], NotCached] = not_cached
+    detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached
+
+
+class Fetcher:
+    """Queries are getting more and more complex and different phases of the
+    query depend on different data, not only from the user but from other parts
+    of the system.
+
+    This class is an encapsulation of data gathering across different parts of
+    the system. Given the user query input, it aims to be as efficient as
+    possible removing redundant expensive calls to other parts of the system. An
+    instance of a fetcher caches its results and is thought to be used in the
+    context of a single request. DO NOT use this as a global object!
+
+    """
+
+    def __init__(
+        self,
+        kbid: str,
+        *,
+        query: str,
+        user_vector: Optional[list[float]],
+        vectorset: Optional[str],
+        rephrase: bool,
+        rephrase_prompt: Optional[str],
+        generative_model: Optional[str],
+    ):
+        self.kbid = kbid
+        self.query = query
+        self.user_vector = user_vector
+        self.user_vectorset = vectorset
+        self.rephrase = rephrase
+        self.rephrase_prompt = rephrase_prompt
+        self.generative_model = generative_model
+
+        self.cache = FetcherCache()
+        self._validated = False
+
+    # Validation
+
+    async def initial_validate(self):
+        """Runs a validation on the input parameters. It can raise errors if
+        there's some wrong parameter.
+
+        This function should always be called if validated input for fetching is
+        desired.
+        """
+        if self._validated:
+            return
+
+        self._validated = True
+
+    async def _validate_vectorset(self):
+        if self.user_vectorset is not None:
+            await validate_vectorset(self.kbid, self.user_vectorset)
+
+    # Semantic search
+
+    async def get_matryoshka_dimension(self) -> Optional[int]:
+        if is_cached(self.cache.matryoshka_dimension):
+            return self.cache.matryoshka_dimension
+
+        vectorset = await self.get_vectorset()
+        matryoshka_dimension = await get_matryoshka_dimension_cached(self.kbid, vectorset)
+        self.cache.matryoshka_dimension = matryoshka_dimension
+        return matryoshka_dimension
+
+    async def _get_user_vectorset(self) -> Optional[str]:
+        """Returns the user's requested vectorset and validates if it does exist
+        in the KB.
+
+        """
+        vectorset = self.user_vectorset
+        if not self._validated:
+            await self._validate_vectorset()
+        return vectorset
+
+    async def get_vectorset(self) -> str:
+        """Get the vectorset to be used in the search. If not specified by the
+        user, Predict API or the KB itself will provide a default.
+
+        """
+
+        if is_cached(self.cache.vectorset):
+            return self.cache.vectorset
+
+        if self.user_vectorset:
+            # user explicitly asked for a vectorset
+            self.cache.vectorset = self.user_vectorset
+            return self.user_vectorset
+
+        # when it's not provided, we get the default from Predict API
+        query_info = await self._predict_query_endpoint()
+        if query_info is None:
+            vectorset = None
+        else:
+            if query_info.sentence is None:
+                logger.error(
+                    "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
+                )
+                vectorset = None
+            else:
+                # vectors field is enforced by the data model to have at least one key
+                for vectorset in query_info.sentence.vectors.keys():
+                    vectorset = vectorset
+                    break
+
+        if vectorset is None:
+            # in case predict doesn't answer which vectorset to use, fallback to
+            # the first vectorset of the KB
+            async with datamanagers.with_ro_transaction() as txn:
+                async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
+                    break
+            assert vectorset is not None, "All KBs must have at least one vectorset in maindb"
+
+        self.cache.vectorset = vectorset
+        return vectorset
+
+    async def get_query_vector(self) -> Optional[list[float]]:
+        if is_cached(self.cache.query_vector):
+            return self.cache.query_vector
+
+        if self.user_vector is not None:
+            query_vector = self.user_vector
+        else:
+            query_info = await self._predict_query_endpoint()
+            if query_info is None or query_info.sentence is None:
+                self.cache.query_vector = None
+                return None
+
+            vectorset = await self.get_vectorset()
+            if vectorset not in query_info.sentence.vectors:
+                logger.warning(
+                    "Predict is not responding with a valid query nucliadb vectorset",
+                    extra={
+                        "kbid": self.kbid,
+                        "vectorset": vectorset,
+                        "predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
+                    },
+                )
+                self.cache.query_vector = None
+                return None
+
+            query_vector = query_info.sentence.vectors[vectorset]
+
+        matryoshka_dimension = await self.get_matryoshka_dimension()
+        if matryoshka_dimension is not None:
+            if self.user_vector is not None and len(query_vector) < matryoshka_dimension:
+                raise InvalidQueryError(
+                    "vector",
+                    f"Invalid vector length, please check valid embedding size for {vectorset} model",
+                )
+
+            # KB using a matryoshka embeddings model, cut the query vector
+            # accordingly
+            query_vector = query_vector[:matryoshka_dimension]
+
+        self.cache.query_vector = query_vector
+        return query_vector
+
+    # Labels
+
+    async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
+        if is_cached(self.cache.labels):
+            return self.cache.labels
+
+        labels = await get_classification_labels(self.kbid)
+        self.cache.labels = labels
+        return labels
+
+    # Entities
+
+    async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
+        if is_cached(self.cache.entities_meta_cache):
+            return self.cache.entities_meta_cache
+
+        entities_meta_cache = await get_entities_meta_cache(self.kbid)
+        self.cache.entities_meta_cache = entities_meta_cache
+        return entities_meta_cache
+
+    async def get_deleted_entity_groups(self) -> list[str]:
+        if is_cached(self.cache.deleted_entity_groups):
+            return self.cache.deleted_entity_groups
+
+        deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
+        self.cache.deleted_entity_groups = deleted_entity_groups
+        return deleted_entity_groups
+
+    async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
+        if is_cached(self.cache.detected_entities):
+            return self.cache.detected_entities
+
+        # Optimization to avoid calling predict twice
+        if is_cached(self.cache.predict_query_info):
+            # /query is a superset of detect entities, so we already have them
+            query_info = self.cache.predict_query_info
+            if query_info is not None and query_info.entities is not None:
+                detected_entities = convert_relations(query_info.entities.model_dump())
+            else:
+                detected_entities = []
+        else:
+            # No call to /query has been done, we'll use detect entities
+            # endpoint instead (as it's faster)
+            detected_entities = await self._predict_detect_entities()
+
+        self.cache.detected_entities = detected_entities
+        return detected_entities
+
+    # Synonyms
+
+    async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
+        if is_cached(self.cache.synonyms):
+            return self.cache.synonyms
+
+        synonyms = await get_kb_synonyms(self.kbid)
+        self.cache.synonyms = synonyms
+        return synonyms
+
+    # Predict API
+
+    async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
+        if is_cached(self.cache.predict_query_info):
+            return self.cache.predict_query_info
+
+        # calling twice should be avoided as query endpoint is a superset of detect entities
+        if is_cached(self.cache.predict_detected_entities):
+            logger.warning("Fetcher is not being efficient enough and has called predict twice!")
+
+        # we can't call get_vectorset, as it would do a recursive loop between
+        # functions, so we'll manually parse it
+        vectorset = await self._get_user_vectorset()
+        try:
+            query_info = await query_information(
+                self.kbid,
+                self.query,
+                vectorset,
+                self.generative_model,
+                self.rephrase,
+                self.rephrase_prompt,
+            )
+        except SendToPredictError:
+            query_info = None
+
+        self.cache.predict_query_info = query_info
+        return query_info
+
+    async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
+        if is_cached(self.cache.predict_detected_entities):
+            return self.cache.predict_detected_entities
+
+        try:
+            detected_entities = await detect_entities(self.kbid, self.query)
+        except SendToPredictError as ex:
+            logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
+            detected_entities = []
+
+        self.cache.predict_detected_entities = detected_entities
+        return detected_entities
+
+
+async def validate_vectorset(kbid: str, vectorset: str):
+    async with datamanagers.with_ro_transaction() as txn:
+        if not await datamanagers.vectorsets.exists(txn, kbid=kbid, vectorset_id=vectorset):
+            raise InvalidQueryError(
+                "vectorset", f"Vectorset {vectorset} doesn't exist in your Knowledge Box"
+            )
+
+
+@query_parse_dependency_observer.wrap({"type": "query_information"})
+async def query_information(
+    kbid: str,
+    query: str,
+    semantic_model: Optional[str],
+    generative_model: Optional[str] = None,
+    rephrase: bool = False,
+    rephrase_prompt: Optional[str] = None,
+) -> QueryInfo:
+    predict = get_predict()
+    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
+
+
+@query_parse_dependency_observer.wrap({"type": "detect_entities"})
+async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
+    predict = get_predict()
+    return await predict.detect_entities(kbid, query)
+
+
+@alru_cache(maxsize=None)
+async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+    # This can be safely cached as the matryoshka dimension is not expected to change
+    return await get_matryoshka_dimension(kbid, vectorset)
+
+
+@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
+async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+    async with get_driver().transaction(read_only=True) as txn:
+        matryoshka_dimension = None
+        if not vectorset:
+            # XXX this should be migrated once we remove the "default" vectorset
+            # concept
+            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
+        else:
+            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
+            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
+                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
+
+        return matryoshka_dimension
+
+
+@query_parse_dependency_observer.wrap({"type": "classification_labels"})
+async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.labels.get_labels(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "synonyms"})
+async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.synonyms.get(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
+async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
+
+
+@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
+async def get_deleted_entity_groups(kbid: str) -> list[str]:
+    async with get_driver().transaction(read_only=True) as txn:
+        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
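A detail worth calling out in fetcher.py is the `NotCached` sentinel: `None` is a legitimate cached value (a KB with no synonyms, no matryoshka dimension), so `Optional` cannot double as the miss marker, and `TypeIs` lets mypy narrow `Union[T, NotCached]` down to `T` after the check. The same pattern in isolation, as a standalone sketch with a made-up `Demo` class:

from typing import Optional, TypeVar, Union

from typing_extensions import TypeIs

class NotCached:
    pass

not_cached = NotCached()
T = TypeVar("T")

def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
    return not isinstance(field, NotCached)

class Demo:
    # "not fetched yet" and "fetched, and the answer was None" stay distinct
    value: Union[Optional[str], NotCached] = not_cached

    async def get_value(self) -> Optional[str]:
        if is_cached(self.value):
            return self.value  # cache hit, possibly a legitimate None
        result = await self._expensive_lookup()
        self.value = result
        return result

    async def _expensive_lookup(self) -> Optional[str]:
        return None  # "nothing configured" is a valid, cacheable answer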
nucliadb/search/search/query_parser/parser.py CHANGED
@@ -26,7 +26,7 @@ from nucliadb.search.search.filters import (
     convert_to_node_filters,
     translate_label_filters,
 )
-from nucliadb.search.search.query_parser.exceptions import
+from nucliadb.search.search.query_parser.exceptions import InternalParserError
 from nucliadb.search.search.query_parser.models import (
     CatalogFilters,
     CatalogQuery,
@@ -50,25 +50,26 @@ from nucliadb_models.search import (
 )
 
 
-def parse_find(item: FindRequest) -> UnitRetrieval:
-    parser = _FindParser(item)
-    return parser.parse()
+async def parse_find(kbid: str, item: FindRequest) -> UnitRetrieval:
+    parser = _FindParser(kbid, item)
+    return await parser.parse()
 
 
 class _FindParser:
-    def __init__(self, item: FindRequest):
+    def __init__(self, kbid: str, item: FindRequest):
+        self.kbid = kbid
         self.item = item
 
-    def parse(self) -> UnitRetrieval:
+    async def parse(self) -> UnitRetrieval:
        top_k = self._parse_top_k()
        try:
            rank_fusion = self._parse_rank_fusion()
        except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in rank fusion: {str(exc)}") from exc
        try:
            reranker = self._parse_reranker()
        except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in reranker: {str(exc)}") from exc
 
        # Adjust retrieval windows. Our current implementation assume:
        # `top_k <= reranker.window <= rank_fusion.window`
@@ -98,7 +99,7 @@ class _FindParser:
             if self.item.rank_fusion == search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
                 rank_fusion = ReciprocalRankFusion(window=window)
             else:
-                raise
+                raise InternalParserError(f"Unknown rank fusion algorithm: {self.item.rank_fusion}")
 
         elif isinstance(self.item.rank_fusion, search_models.ReciprocalRankFusion):
             user_window = self.item.rank_fusion.window
@@ -109,7 +110,7 @@ class _FindParser:
             )
 
         else:
-            raise
+            raise InternalParserError(f"Unknown rank fusion {self.item.rank_fusion}")
 
         return rank_fusion
 
@@ -131,14 +132,14 @@ class _FindParser:
                 reranking = PredictReranker(window=min(top_k * 2, 200))
 
             else:
-                raise
+                raise InternalParserError(f"Unknown reranker algorithm: {self.item.reranker}")
 
         elif isinstance(self.item.reranker, search_models.PredictReranker):
             user_window = self.item.reranker.window
             reranking = PredictReranker(window=min(max(user_window or 0, top_k), 200))
 
         else:
-            raise
+            raise InternalParserError(f"Unknown reranker {self.item.reranker}")
 
         return reranking
 
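With these parser.py changes, `parse_find` becomes async and KB-aware, and pydantic `ValidationError`s raised while building rank fusion or reranker configs surface as `InternalParserError` (note the `from exc` chaining above). A sketch of what a call site sees; the logging around it is illustrative:

import logging

from nucliadb.search.search.query_parser.exceptions import InternalParserError
from nucliadb.search.search.query_parser.parser import parse_find

async def parse_or_log(kbid, find_request):
    try:
        return await parse_find(kbid, find_request)  # now async and takes the kbid
    except InternalParserError as exc:
        # __cause__ carries the original pydantic ValidationError when there was one
        logging.error("find parsing failed: %s (cause: %r)", exc, exc.__cause__)
        raise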
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: nucliadb
-Version: 6.2.1.post3139
+Version: 6.2.1.post3165
 Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
 Author: NucliaDB Community
 Author-email: nucliadb@nuclia.com
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9, <4
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post3139
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post3139
-Requires-Dist: nucliadb-protos>=6.2.1.post3139
-Requires-Dist: nucliadb-models>=6.2.1.post3139
+Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post3165
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post3165
+Requires-Dist: nucliadb-protos>=6.2.1.post3165
+Requires-Dist: nucliadb-models>=6.2.1.post3165
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/RECORD CHANGED
@@ -207,10 +207,10 @@ nucliadb/search/requesters/utils.py,sha256=ZTiWDkDihJ7rcvs7itCe8hr6OclVcvu_2EAPF
 nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/cache.py,sha256=n9vkN6Y6Xnr2RBJyoH0WzjzGTJOMfKekU9tfPTWWCPc,6810
 nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
-nucliadb/search/search/exceptions.py,sha256=
+nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77ofkOScEI,1039
 nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_QziX4,6184
 nucliadb/search/search/filters.py,sha256=1MkHlJjAQqoRCj7e5cEzK2HvBxGLE17I_omsjiklbtw,6476
-nucliadb/search/search/find.py,sha256=
+nucliadb/search/search/find.py,sha256=DaO3CPBQqRAw-iK_DNf_gM-aEipjtuX6oA2TbAplkxs,9901
 nucliadb/search/search/find_merge.py,sha256=5Aqz54E5GG8jw666KNncVHIJcs821ug-YwJ46YL6Br8,17363
 nucliadb/search/search/graph_strategy.py,sha256=Egcq_zn895gTUYmyQTsXj8YaUMa3HBKhcSa1GBvgzAM,31877
 nucliadb/search/search/hydrator.py,sha256=-R37gCrGxkyaiHQalnTWHNG_FCx11Zucd7qA1vQCxuw,6985
@@ -219,7 +219,7 @@ nucliadb/search/search/metrics.py,sha256=81X-tahGW4n2CLvUzCPdNxNClmZqUWZjcVOGCUH
 nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
 nucliadb/search/search/pgcatalog.py,sha256=IaNK4dAxdXs38PoIkTdgqMDuZDjeiOtcXn3LeaT-OMw,8855
 nucliadb/search/search/predict_proxy.py,sha256=xBlh6kjuQpWRq7KsBx4pEl2PtnwljjQIiYMaTWpcCSA,3015
-nucliadb/search/search/query.py,sha256=
+nucliadb/search/search/query.py,sha256=doRdBhM928wB64v271RSyJxsRT5qd6oevImEMz4gpvw,29487
 nucliadb/search/search/rank_fusion.py,sha256=tRGo_KlsFsVx1CQEy1iqQ6f0T1Dq1kf0axDXHuuzvvM,6946
 nucliadb/search/search/rerankers.py,sha256=0kAHES9X_FKkP7KSN9NRETFmRPKzwrFAo_54MbyvM7Q,9051
 nucliadb/search/search/shards.py,sha256=JSRSrHgHcF4sXyuZZoJdMfK0v_LHpoSRf1lCr5-K5ko,2742
@@ -232,9 +232,10 @@ nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqE
 nucliadb/search/search/chat/prompt.py,sha256=r2JTiRWH3YHPdeRAG5w6gD0g0fWVxdTjYIR86qAVa7k,47106
 nucliadb/search/search/chat/query.py,sha256=rBssR6MPSx8h2DASRMTLODaz9oGE5tNVVVeDncSrEp4,15684
 nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/search/search/query_parser/exceptions.py,sha256=
+nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
+nucliadb/search/search/query_parser/fetcher.py,sha256=NnzbRIhtg15_N9rw6uNXgPLNOjmO_dv8HMvAskLZ6-g,15496
 nucliadb/search/search/query_parser/models.py,sha256=-VlCDXUCgOroAZw1Leqhj2VMgRv_CD2w40PXXOBLaUM,2332
-nucliadb/search/search/query_parser/parser.py,sha256=
+nucliadb/search/search/query_parser/parser.py,sha256=JC6koS9Np1PzCfEk1Xy6mpP1HmovS_vIxxA9u-kwzos,6498
 nucliadb/standalone/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/standalone/api_router.py,sha256=zR03TQ-Pd2kXx1jeV83Puw19112Z8Jhln7p1cAn69kg,6699
 nucliadb/standalone/app.py,sha256=mAApNK_iVsQgJyd-mtwCeZq5csSimwnXmlQGH9a70pE,5586
@@ -331,9 +332,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.2.1.post3139.dist-info/METADATA,sha256=
-nucliadb-6.2.1.post3139.dist-info/WHEEL,sha256=
-nucliadb-6.2.1.post3139.dist-info/entry_points.txt,sha256=
-nucliadb-6.2.1.post3139.dist-info/top_level.txt,sha256=
-nucliadb-6.2.1.post3139.dist-info/zip-safe,sha256=
-nucliadb-6.2.1.post3139.dist-info/RECORD,,
+nucliadb-6.2.1.post3165.dist-info/METADATA,sha256=9FA7BAbWWQlT3pJKH0iexO3PiSOl7mpz-PAh8W7kdxs,4603
+nucliadb-6.2.1.post3165.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nucliadb-6.2.1.post3165.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.2.1.post3165.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.2.1.post3165.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb-6.2.1.post3165.dist-info/RECORD,,
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/WHEEL: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/entry_points.txt: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/top_level.txt: File without changes
{nucliadb-6.2.1.post3139.dist-info → nucliadb-6.2.1.post3165.dist-info}/zip-safe: File without changes