nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/search/search/query_parser/fetcher.py
ADDED

@@ -0,0 +1,405 @@

```python
# Copyright (C) 2021 Bosutech XXI S.L.
#
# nucliadb is offered under the AGPL v3.0 and as commercial software.
# For commercial licensing, contact us at info@nuclia.com.
#
# AGPL:
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from typing import Optional, TypeVar, Union

from async_lru import alru_cache
from typing_extensions import TypeIs

from nucliadb.common import datamanagers
from nucliadb.common.maindb.utils import get_driver
from nucliadb.search import logger
from nucliadb.search.predict import SendToPredictError, convert_relations
from nucliadb.search.search.metrics import (
    query_parse_dependency_observer,
)
from nucliadb.search.search.query_parser.exceptions import InvalidQueryError
from nucliadb.search.utilities import get_predict
from nucliadb_models.internal.predict import QueryInfo
from nucliadb_protos import knowledgebox_pb2, utils_pb2


# We use a class as cache miss marker to allow None values in the cache and to
# make mypy happy with typing
class NotCached:
    pass


not_cached = NotCached()


T = TypeVar("T")


def is_cached(field: Union[T, NotCached]) -> TypeIs[T]:
    return not isinstance(field, NotCached)


class FetcherCache:
    predict_query_info: Union[Optional[QueryInfo], NotCached] = not_cached
    predict_detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached

    # semantic search
    query_vector: Union[Optional[list[float]], NotCached] = not_cached
    vectorset: Union[str, NotCached] = not_cached
    matryoshka_dimension: Union[Optional[int], NotCached] = not_cached

    labels: Union[knowledgebox_pb2.Labels, NotCached] = not_cached

    synonyms: Union[Optional[knowledgebox_pb2.Synonyms], NotCached] = not_cached

    entities_meta_cache: Union[datamanagers.entities.EntitiesMetaCache, NotCached] = not_cached
    deleted_entity_groups: Union[list[str], NotCached] = not_cached
    detected_entities: Union[list[utils_pb2.RelationNode], NotCached] = not_cached


class Fetcher:
    """Queries are getting more and more complex and different phases of the
    query depend on different data, not only from the user but from other parts
    of the system.

    This class is an encapsulation of data gathering across different parts of
    the system. Given the user query input, it aims to be as efficient as
    possible removing redundant expensive calls to other parts of the system. An
    instance of a fetcher caches it's results and it's thought to be used in the
    context of a single request. DO NOT use this as a global object!

    """

    def __init__(
        self,
        kbid: str,
        *,
        query: str,
        user_vector: Optional[list[float]],
        vectorset: Optional[str],
        rephrase: bool,
        rephrase_prompt: Optional[str],
        generative_model: Optional[str],
    ):
        self.kbid = kbid
        self.query = query
        self.user_vector = user_vector
        self.user_vectorset = vectorset
        self.rephrase = rephrase
        self.rephrase_prompt = rephrase_prompt
        self.generative_model = generative_model

        self.cache = FetcherCache()
        self._validated = False

    # Validation

    async def initial_validate(self):
        """Runs a validation on the input parameters. It can raise errors if
        there's some wrong parameter.

        This function should be always called if validated input for fetching is
        desired
        """
        if self._validated:
            return

        self._validated = True

    async def _validate_vectorset(self):
        if self.user_vectorset is not None:
            await validate_vectorset(self.kbid, self.user_vectorset)

    # Semantic search

    async def get_matryoshka_dimension(self) -> Optional[int]:
        if is_cached(self.cache.matryoshka_dimension):
            return self.cache.matryoshka_dimension

        vectorset = await self.get_vectorset()
        matryoshka_dimension = await get_matryoshka_dimension_cached(self.kbid, vectorset)
        self.cache.matryoshka_dimension = matryoshka_dimension
        return matryoshka_dimension

    async def _get_user_vectorset(self) -> Optional[str]:
        """Returns the user's requested vectorset and validates if it does exist
        in the KB.

        """
        vectorset = self.user_vectorset
        if not self._validated:
            await self._validate_vectorset()
        return vectorset

    async def get_vectorset(self) -> str:
        """Get the vectorset to be used in the search. If not specified, by the
        user, Predict API or the own uses KB will provide a default.

        """

        if is_cached(self.cache.vectorset):
            return self.cache.vectorset

        if self.user_vectorset:
            # user explicitly asked for a vectorset
            self.cache.vectorset = self.user_vectorset
            return self.user_vectorset

        # when it's not provided, we get the default from Predict API
        query_info = await self._predict_query_endpoint()
        if query_info is None:
            vectorset = None
        else:
            if query_info.sentence is None:
                logger.error(
                    "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
                )
                vectorset = None
            else:
                # vectors field is enforced by the data model to have at least one key
                for vectorset in query_info.sentence.vectors.keys():
                    vectorset = vectorset
                    break

        if vectorset is None:
            # in case predict don't answer which vectorset to use, fallback to
            # the first vectorset of the KB
            async with datamanagers.with_ro_transaction() as txn:
                async for vectorset, _ in datamanagers.vectorsets.iter(txn, kbid=self.kbid):
                    break
            assert vectorset is not None, "All KBs must have at least one vectorset in maindb"

        self.cache.vectorset = vectorset
        return vectorset

    async def get_query_vector(self) -> Optional[list[float]]:
        if is_cached(self.cache.query_vector):
            return self.cache.query_vector

        if self.user_vector is not None:
            query_vector = self.user_vector
        else:
            query_info = await self._predict_query_endpoint()
            if query_info is None or query_info.sentence is None:
                self.cache.query_vector = None
                return None

            vectorset = await self.get_vectorset()
            if vectorset not in query_info.sentence.vectors:
                logger.warning(
                    "Predict is not responding with a valid query nucliadb vectorset",
                    extra={
                        "kbid": self.kbid,
                        "vectorset": vectorset,
                        "predict_vectorsets": ",".join(query_info.sentence.vectors.keys()),
                    },
                )
                self.cache.query_vector = None
                return None

            query_vector = query_info.sentence.vectors[vectorset]

        matryoshka_dimension = await self.get_matryoshka_dimension()
        if matryoshka_dimension is not None:
            if self.user_vector is not None and len(query_vector) < matryoshka_dimension:
                raise InvalidQueryError(
                    "vector",
                    f"Invalid vector length, please check valid embedding size for {vectorset} model",
                )

            # KB using a matryoshka embeddings model, cut the query vector
            # accordingly
            query_vector = query_vector[:matryoshka_dimension]

        self.cache.query_vector = query_vector
        return query_vector

    async def get_rephrased_query(self) -> Optional[str]:
        query_info = await self._predict_query_endpoint()
        if query_info is None:
            return None
        return query_info.rephrased_query

    # Labels

    async def get_classification_labels(self) -> knowledgebox_pb2.Labels:
        if is_cached(self.cache.labels):
            return self.cache.labels

        labels = await get_classification_labels(self.kbid)
        self.cache.labels = labels
        return labels

    # Entities

    async def get_entities_meta_cache(self) -> datamanagers.entities.EntitiesMetaCache:
        if is_cached(self.cache.entities_meta_cache):
            return self.cache.entities_meta_cache

        entities_meta_cache = await get_entities_meta_cache(self.kbid)
        self.cache.entities_meta_cache = entities_meta_cache
        return entities_meta_cache

    async def get_deleted_entity_groups(self) -> list[str]:
        if is_cached(self.cache.deleted_entity_groups):
            return self.cache.deleted_entity_groups

        deleted_entity_groups = await get_deleted_entity_groups(self.kbid)
        self.cache.deleted_entity_groups = deleted_entity_groups
        return deleted_entity_groups

    async def get_detected_entities(self) -> list[utils_pb2.RelationNode]:
        if is_cached(self.cache.detected_entities):
            return self.cache.detected_entities

        # Optimization to avoid calling predict twice
        if is_cached(self.cache.predict_query_info):
            # /query supersets detect entities, so we already have them
            query_info = self.cache.predict_query_info
            if query_info is not None and query_info.entities is not None:
                detected_entities = convert_relations(query_info.entities.model_dump())
            else:
                detected_entities = []
        else:
            # No call to /query has been done, we'll use detect entities
            # endpoint instead (as it's faster)
            detected_entities = await self._predict_detect_entities()

        self.cache.detected_entities = detected_entities
        return detected_entities

    # Synonyms

    async def get_synonyms(self) -> Optional[knowledgebox_pb2.Synonyms]:
        if is_cached(self.cache.synonyms):
            return self.cache.synonyms

        synonyms = await get_kb_synonyms(self.kbid)
        self.cache.synonyms = synonyms
        return synonyms

    # Predict API

    async def _predict_query_endpoint(self) -> Optional[QueryInfo]:
        if is_cached(self.cache.predict_query_info):
            return self.cache.predict_query_info

        # calling twice should be avoided as query endpoint is a superset of detect entities
        if is_cached(self.cache.predict_detected_entities):
            logger.warning("Fetcher is not being efficient enough and has called predict twice!")

        # we can't call get_vectorset, as it would do a recirsive loop between
        # functions, so we'll manually parse it
        vectorset = await self._get_user_vectorset()
        try:
            query_info = await query_information(
                self.kbid,
                self.query,
                vectorset,
                self.generative_model,
                self.rephrase,
                self.rephrase_prompt,
            )
        except (SendToPredictError, TimeoutError):
            query_info = None

        self.cache.predict_query_info = query_info
        return query_info

    async def _predict_detect_entities(self) -> list[utils_pb2.RelationNode]:
        if is_cached(self.cache.predict_detected_entities):
            return self.cache.predict_detected_entities

        try:
            detected_entities = await detect_entities(self.kbid, self.query)
        except (SendToPredictError, TimeoutError) as ex:
            logger.warning(f"Errors on Predict API detecting entities: {ex}", extra={"kbid": self.kbid})
            detected_entities = []

        self.cache.predict_detected_entities = detected_entities
        return detected_entities


async def validate_vectorset(kbid: str, vectorset: str):
    async with datamanagers.with_ro_transaction() as txn:
        if not await datamanagers.vectorsets.exists(txn, kbid=kbid, vectorset_id=vectorset):
            raise InvalidQueryError(
                "vectorset", f"Vectorset {vectorset} doesn't exist in you Knowledge Box"
            )


@query_parse_dependency_observer.wrap({"type": "query_information"})
async def query_information(
    kbid: str,
    query: str,
    semantic_model: Optional[str],
    generative_model: Optional[str] = None,
    rephrase: bool = False,
    rephrase_prompt: Optional[str] = None,
) -> QueryInfo:
    predict = get_predict()
    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)


@query_parse_dependency_observer.wrap({"type": "detect_entities"})
async def detect_entities(kbid: str, query: str) -> list[utils_pb2.RelationNode]:
    predict = get_predict()
    return await predict.detect_entities(kbid, query)


@alru_cache(maxsize=None)
async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
    # This can be safely cached as the matryoshka dimension is not expected to change
    return await get_matryoshka_dimension(kbid, vectorset)


@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
    async with get_driver().transaction(read_only=True) as txn:
        matryoshka_dimension = None
        if not vectorset:
            # XXX this should be migrated once we remove the "default" vectorset
            # concept
            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
        else:
            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension

        return matryoshka_dimension


@query_parse_dependency_observer.wrap({"type": "classification_labels"})
async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
    async with get_driver().transaction(read_only=True) as txn:
        return await datamanagers.labels.get_labels(txn, kbid=kbid)


@query_parse_dependency_observer.wrap({"type": "synonyms"})
async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
    async with get_driver().transaction(read_only=True) as txn:
        return await datamanagers.synonyms.get(txn, kbid=kbid)


@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
    async with get_driver().transaction(read_only=True) as txn:
        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)


@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
async def get_deleted_entity_groups(kbid: str) -> list[str]:
    async with get_driver().transaction(read_only=True) as txn:
        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
```
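For context, here is a minimal sketch of how this per-request cache might be exercised. The call site and all values are illustrative assumptions, not part of the diff; only the `Fetcher` API shown above is taken from the new module.

```python
# Hypothetical call site: values and wiring are illustrative only.
from nucliadb.search.search.query_parser.fetcher import Fetcher


async def retrieve(kbid: str, query: str):
    fetcher = Fetcher(
        kbid,
        query=query,
        user_vector=None,       # let Predict API compute the embedding
        vectorset=None,         # fall back to the KB's default vectorset
        rephrase=False,
        rephrase_prompt=None,
        generative_model=None,
    )
    await fetcher.initial_validate()

    # Both calls below reuse the single cached /query response, so the
    # Predict API is only hit once for this request.
    vectorset = await fetcher.get_vectorset()
    query_vector = await fetcher.get_query_vector()
    return vectorset, query_vector
```

Because the cache lives on the instance, the fetcher must be created per request, as the docstring warns; sharing one instance across requests would leak one request's query data into another.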
nucliadb/search/search/query_parser/parser.py
CHANGED

```diff
@@ -26,12 +26,11 @@ from nucliadb.search.search.filters import (
     convert_to_node_filters,
     translate_label_filters,
 )
-from nucliadb.search.search.query_parser.exceptions import
+from nucliadb.search.search.query_parser.exceptions import InternalParserError
 from nucliadb.search.search.query_parser.models import (
     CatalogFilters,
     CatalogQuery,
     DateTimeFilter,
-    MultiMatchBoosterReranker,
     NoopReranker,
     PredictReranker,
     RankFusion,
@@ -50,25 +49,26 @@ from nucliadb_models.search import (
 )
 
 
-def parse_find(item: FindRequest) -> UnitRetrieval:
-    parser = _FindParser(item)
-    return parser.parse()
+async def parse_find(kbid: str, item: FindRequest) -> UnitRetrieval:
+    parser = _FindParser(kbid, item)
+    return await parser.parse()
 
 
 class _FindParser:
-    def __init__(self, item: FindRequest):
+    def __init__(self, kbid: str, item: FindRequest):
+        self.kbid = kbid
         self.item = item
 
-    def parse(self) -> UnitRetrieval:
+    async def parse(self) -> UnitRetrieval:
         top_k = self._parse_top_k()
         try:
             rank_fusion = self._parse_rank_fusion()
         except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in rank fusion: {str(exc)}") from exc
         try:
             reranker = self._parse_reranker()
         except ValidationError as exc:
-            raise
+            raise InternalParserError(f"Parsing error in reranker: {str(exc)}") from exc
 
         # Adjust retrieval windows. Our current implementation assume:
         # `top_k <= reranker.window <= rank_fusion.window`
@@ -98,7 +98,7 @@ class _FindParser:
             if self.item.rank_fusion == search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
                 rank_fusion = ReciprocalRankFusion(window=window)
             else:
-                raise
+                raise InternalParserError(f"Unknown rank fusion algorithm: {self.item.rank_fusion}")
 
         elif isinstance(self.item.rank_fusion, search_models.ReciprocalRankFusion):
             user_window = self.item.rank_fusion.window
@@ -109,7 +109,7 @@ class _FindParser:
             )
 
         else:
-            raise
+            raise InternalParserError(f"Unknown rank fusion {self.item.rank_fusion}")
 
         return rank_fusion
 
@@ -122,33 +122,34 @@ class _FindParser:
             if self.item.reranker == search_models.RerankerName.NOOP:
                 reranking = NoopReranker()
 
-            elif self.item.reranker == search_models.RerankerName.MULTI_MATCH_BOOSTER:
-                reranking = MultiMatchBoosterReranker()
-
             elif self.item.reranker == search_models.RerankerName.PREDICT_RERANKER:
                 # for predict rearnker, by default, we want a x2 factor with a
                 # top of 200 results
                 reranking = PredictReranker(window=min(top_k * 2, 200))
 
             else:
-                raise
+                raise InternalParserError(f"Unknown reranker algorithm: {self.item.reranker}")
 
         elif isinstance(self.item.reranker, search_models.PredictReranker):
             user_window = self.item.reranker.window
             reranking = PredictReranker(window=min(max(user_window or 0, top_k), 200))
 
         else:
-            raise
+            raise InternalParserError(f"Unknown reranker {self.item.reranker}")
 
         return reranking
 
 
 def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> CatalogQuery:
-
-
-
-
-
+    filters = item.filters
+
+    if item.hidden is not None:
+        if item.hidden:
+            filters.append(Filter(all=[LABEL_HIDDEN]))  # type: ignore
+        else:
+            filters.append(Filter(none=[LABEL_HIDDEN]))  # type: ignore
+
     label_filters: dict[str, Any] = convert_to_node_filters(item.filters)
     if len(label_filters) > 0:
         label_filters = translate_label_filters(label_filters)
```

(The content of the five removed `parse_catalog` lines was not captured by the diff rendering and is left blank above.)
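Since `parse_find` is now a coroutine and takes the `kbid`, callers have to await it. A hedged sketch of a call site; the surrounding request handling and error mapping are assumptions for illustration, not code from this diff:

```python
# Illustrative only: FindRequest construction and error handling are assumed.
from nucliadb.search.search.query_parser.exceptions import InternalParserError, InvalidQueryError
from nucliadb.search.search.query_parser.parser import parse_find
from nucliadb_models.search import FindRequest


async def build_retrieval(kbid: str, item: FindRequest):
    try:
        # parse_find() is now async and needs the kbid, presumably so parsing
        # can read per-KB state instead of working purely on the request body.
        retrieval = await parse_find(kbid, item)
    except InvalidQueryError:
        raise  # bad user input: would surface as a 4xx
    except InternalParserError:
        raise  # unexpected parser state: would surface as a 5xx
    return retrieval
```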
nucliadb/search/search/rerankers.py
CHANGED

```diff
@@ -169,58 +169,17 @@ class PredictReranker(Reranker):
         return best
 
 
-class MultiMatchBoosterReranker(Reranker):
-    """This reranker gives more value to items that come from different indices"""
-
-    @property
-    def window(self) -> Optional[int]:
-        return None
-
-    @reranker_observer.wrap({"type": "multi_match_booster"})
-    async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
-        """Given a list of rerankable items, boost matches that appear multiple
-        times. The returned list can be smaller than the initial, as repeated
-        matches are deduplicated.
-        """
-        reranked_by_id = {}
-        for item in items:
-            if item.id not in reranked_by_id:
-                reranked_by_id[item.id] = RankedItem(
-                    id=item.id,
-                    score=item.score,
-                    score_type=item.score_type,
-                )
-            else:
-                # it's a mutiple match, boost the score
-                if reranked_by_id[item.id].score < item.score:
-                    # previous implementation noted that we are using vector
-                    # score x2 when we find a multiple match. However, this may
-                    # not be true, as the same paragraph could come in any
-                    # position in the rank fusioned result list
-                    reranked_by_id[item.id].score = item.score * 2
-
-                reranked_by_id[item.id].score_type = SCORE_TYPE.BOTH
-
-        reranked = list(reranked_by_id.values())
-        sort_by_score(reranked)
-        return reranked
-
-
 def get_reranker(reranker: parser_models.Reranker) -> Reranker:
     algorithm: Reranker
 
     if isinstance(reranker, parser_models.NoopReranker):
         algorithm = NoopReranker()
 
-    elif isinstance(reranker, parser_models.MultiMatchBoosterReranker):
-        algorithm = MultiMatchBoosterReranker()
-
     elif isinstance(reranker, parser_models.PredictReranker):
         algorithm = PredictReranker(reranker.window)
 
     else:
-
-        algorithm = MultiMatchBoosterReranker()
+        raise ValueError(f"Unknown reranker requested: {reranker}")
 
     return algorithm
 
```
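The behavioural change for callers: an unrecognised reranker model used to fall back to `MultiMatchBoosterReranker`, now it raises. A small illustrative sketch, assuming `parser_models.NoopReranker()` takes no required fields; none of this code is from the diff itself:

```python
# Illustrative only: demonstrates the new failure mode of get_reranker().
from nucliadb.search.search.query_parser import models as parser_models
from nucliadb.search.search.rerankers import get_reranker

reranker = get_reranker(parser_models.NoopReranker())  # still resolves normally

try:
    get_reranker("not-a-reranker")  # type: ignore[arg-type]
except ValueError as exc:
    # Previously this silently fell back to MultiMatchBoosterReranker,
    # which has been removed in 6.2.1.
    print(exc)
```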
nucliadb/search/search/shards.py
CHANGED

```diff
@@ -19,6 +19,10 @@
 #
 import asyncio
 
+import backoff
+from grpc import StatusCode
+from grpc.aio import AioRpcError
+
 from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb_protos.nodereader_pb2 import (
     GetShardRequest,
@@ -39,6 +43,15 @@ node_observer = metrics.Observer(
 )
 
 
+def should_giveup(e: Exception):
+    if isinstance(e, AioRpcError) and e.code() != StatusCode.NOT_FOUND:
+        return True
+    return False
+
+
+@backoff.on_exception(
+    backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
+)
 async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest) -> SearchResponse:
     req = SearchRequest()
     req.CopyFrom(query)
@@ -47,6 +60,9 @@ async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest)
     return await node.reader.Search(req)  # type: ignore
 
 
+@backoff.on_exception(
+    backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
+)
 async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
     req = GetShardRequest()
     req.shard_id.id = shard_id
@@ -54,6 +70,9 @@ async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
     return await node.reader.GetShard(req)  # type: ignore
 
 
+@backoff.on_exception(
+    backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
+)
 async def suggest_shard(node: AbstractIndexNode, shard: str, query: SuggestRequest) -> SuggestResponse:
     req = SuggestRequest()
     req.CopyFrom(query)
```
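The effect of these decorators: each shard call is attempted up to 3 times with exponential backoff (no jitter, factor 0.1, so waits of roughly 0.1s and 0.2s), but gives up immediately on any gRPC error whose status is not `NOT_FOUND`. A standalone sketch of the same policy; `flaky_call()` is invented for illustration and is not part of nucliadb:

```python
# Standalone sketch of the retry policy added above.
import backoff
from grpc import StatusCode
from grpc.aio import AioRpcError


def should_giveup(e: Exception) -> bool:
    # Give up right away on gRPC errors other than NOT_FOUND;
    # NOT_FOUND and non-gRPC exceptions are retried.
    return isinstance(e, AioRpcError) and e.code() != StatusCode.NOT_FOUND


@backoff.on_exception(
    backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
)
async def flaky_call():
    # Up to 3 attempts in total; after the last failure the exception
    # propagates to the caller.
    ...
```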
nucliadb/standalone/api_router.py
CHANGED

```diff
@@ -17,14 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-import datetime
 import logging
 import time
 
 import orjson
 import pydantic
 from fastapi import Request
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse
 from fastapi.routing import APIRouter
 from fastapi_versioning import version
 from jwcrypto import jwe, jwk  # type: ignore
@@ -33,7 +32,7 @@ from nucliadb.common import datamanagers
 from nucliadb.common.cluster import manager
 from nucliadb.common.http_clients import processing
 from nucliadb.common.http_clients.auth import NucliaAuthHTTPClient
-from nucliadb.standalone import
+from nucliadb.standalone import versions
 from nucliadb_models.resource import NucliaDBRoles
 from nucliadb_utils.authentication import requires
 from nucliadb_utils.settings import nuclia_settings
@@ -146,17 +145,6 @@ async def versions_endpoint(request: Request) -> JSONResponse:
     )
 
 
-@standalone_api_router.get("/introspect")
-def introspect_endpoint(request: Request) -> StreamingResponse:
-    introspect_id = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-    return StreamingResponse(
-        content=introspect.stream_tar(request.app),
-        status_code=200,
-        headers={"Content-Disposition": f"attachment; filename=introspect_{introspect_id}.tar.gz"},
-        media_type="application/octet-stream",
-    )
-
-
 @standalone_api_router.get("/pull/position")
 async def pull_status(request: Request) -> JSONResponse:
     async with datamanagers.with_ro_transaction() as txn:
```

(The removed import lines above are shown as rendered by the diff viewer; the dropped names on the old side were not captured.)
nucliadb/standalone/settings.py
CHANGED

```diff
@@ -83,6 +83,10 @@ class Settings(DriverSettings, StorageSettings, ExtendedStorageSettings):
         default="X-NUCLIADB-ROLES",
         description="Only used for `upstream_naive` auth policy.",
     )
+    auth_policy_security_groups_header: str = pydantic.Field(
+        default="X-NUCLIADB-SECURITY_GROUPS",
+        description="Only used for `upstream_naive` auth policy.",
+    )
     auth_policy_user_default_roles: list[NucliaDBRoles] = pydantic.Field(
         default=[NucliaDBRoles.READER, NucliaDBRoles.WRITER, NucliaDBRoles.MANAGER],
         description="Default role to assign to user that is authenticated \
```
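For standalone deployments using the `upstream_naive` auth policy, the new header name can presumably be overridden the same way as the neighbouring fields. A hedged sketch; the environment-variable mapping is an assumption based on the usual pydantic settings behaviour and is not verified against this release:

```python
# Assumes pydantic settings read fields from same-named environment variables.
import os

os.environ["AUTH_POLICY_SECURITY_GROUPS_HEADER"] = "X-MY-PROXY-GROUPS"

from nucliadb.standalone.settings import Settings

settings = Settings()
print(settings.auth_policy_security_groups_header)  # X-MY-PROXY-GROUPS
```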
|