nucliadb 6.3.7.post4066__py3-none-any.whl → 6.3.7.post4068__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from typing import Optional
21
+
22
+ from nucliadb.search.search.query_parser.fetcher import Fetcher
23
+ from nucliadb.search.search.query_parser.models import (
24
+ Generation,
25
+ )
26
+ from nucliadb_models.search import AskRequest, MaxTokens
27
+
28
+
29
async def parse_ask(kbid: str, item: AskRequest, *, fetcher: Optional[Fetcher] = None) -> Generation:
    """Parse the generation options of an ask request into a `Generation`.

    A pre-built ``fetcher`` can be injected (e.g. to share cached predict
    data); otherwise a default one is created with `fetcher_for_ask`.
    """
    fetcher = fetcher or fetcher_for_ask(kbid, item)
    return await _AskParser(kbid, item, fetcher).parse()
33
+
34
+
35
def fetcher_for_ask(kbid: str, item: AskRequest) -> Fetcher:
    """Build the default `Fetcher` used while parsing an ask request."""
    params = dict(
        kbid=kbid,
        query=item.query,
        # ask requests never carry a precomputed user vector nor a rephrase prompt
        user_vector=None,
        vectorset=item.vectorset,
        rephrase=item.rephrase,
        rephrase_prompt=None,
        generative_model=item.generative_model,
    )
    return Fetcher(**params)
45
+
46
+
47
class _AskParser:
    """Turns an `AskRequest` into the `Generation` parameters for the LLM call."""

    def __init__(self, kbid: str, item: AskRequest, fetcher: Fetcher):
        self.kbid = kbid
        self.item = item
        self.fetcher = fetcher

    async def parse(self) -> Generation:
        """Resolve visual-LLM usage and token limits for this request."""
        use_visual_llm = await self.fetcher.get_visual_llm_enabled()
        max_tokens = self._normalize_max_tokens()
        return Generation(
            use_visual_llm=use_visual_llm,
            max_context_tokens=await self.fetcher.get_max_context_tokens(max_tokens),
            max_answer_tokens=self.fetcher.get_max_answer_tokens(max_tokens),
        )

    def _normalize_max_tokens(self) -> Optional[MaxTokens]:
        """Normalize the request's max_tokens (None | int | MaxTokens) to MaxTokens."""
        requested = self.item.max_tokens
        if requested is None:
            return None
        if isinstance(requested, int):
            # A bare int only limits the answer, not the context
            return MaxTokens(context=None, answer=requested)
        if isinstance(requested, MaxTokens):
            return requested
        # pragma: nocover
        # This is a trick so mypy generates an error if this branch can be reached,
        # that is, if we are missing some ifs
        _a: int = "a"
@@ -0,0 +1,189 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import re
21
+ import string
22
+ from typing import Optional, Union
23
+
24
+ from nucliadb.search import logger
25
+ from nucliadb.search.search.query_parser.exceptions import InvalidQueryError
26
+ from nucliadb.search.search.query_parser.fetcher import Fetcher
27
+ from nucliadb.search.search.query_parser.models import (
28
+ KeywordQuery,
29
+ SemanticQuery,
30
+ )
31
+ from nucliadb.search.search.utils import should_disable_vector_search
32
+ from nucliadb_models import search as search_models
33
+
34
# Generic fallback semantic threshold, used when neither the user nor the
# Predict API provide one (see parse_semantic_min_score below)
DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7

# -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
# between - and *, it will actually trigger a tantivy bug and panic
INVALID_QUERY = re.compile(r"- +\*")
39
+
40
+
41
def validate_base_request(item: search_models.BaseSearchRequest):
    """Validate a search request, raising `InvalidQueryError` on bad input.

    Also mutates ``item.features``: semantic search is dropped when it must
    be disabled for this request.
    """
    # Reject queries known to panic tantivy, better than returning a 500
    if INVALID_QUERY.search(item.query):
        raise InvalidQueryError("query", "Invalid query syntax")

    wants_semantic = search_models.SearchOptions.SEMANTIC in item.features
    wants_relations = search_models.SearchOptions.RELATIONS in item.features

    # synonyms are not compatible with vector/graph search
    if item.with_synonyms and item.query and (wants_semantic or wants_relations):
        raise InvalidQueryError(
            "synonyms",
            "Search with custom synonyms is only supported on paragraph and document search",
        )

    if wants_semantic and should_disable_vector_search(item):
        item.features.remove(search_models.SearchOptions.SEMANTIC)
63
+
64
+
65
def parse_top_k(item: search_models.BaseSearchRequest) -> int:
    """Return the request's top_k; it is expected to be defaulted upstream."""
    assert item.top_k is not None, "top_k must have an int value"
    return item.top_k
69
+
70
+
71
async def parse_keyword_query(
    item: search_models.BaseSearchRequest,
    *,
    fetcher: Fetcher,
) -> KeywordQuery:
    """Build the keyword (bm25) part of a retrieval query.

    When the request enables synonyms and an expanded query can be built,
    the expanded query replaces the original one.
    """
    expanded: Optional[str] = None
    if item.with_synonyms:
        expanded = await query_with_synonyms(item.query, fetcher=fetcher)

    return KeywordQuery(
        query=item.query if expanded is None else expanded,
        is_synonyms_query=expanded is not None,
        min_score=parse_keyword_min_score(item.min_score),
    )
92
+
93
+
94
async def parse_semantic_query(
    item: search_models.BaseSearchRequest,
    *,
    fetcher: Fetcher,
) -> SemanticQuery:
    """Build the semantic (vector) part of a retrieval query."""
    # Resolve the vectorset first, then the query vector computed against it
    vectorset = await fetcher.get_vectorset()
    query_vector = await fetcher.get_query_vector()
    threshold = await parse_semantic_min_score(item.min_score, fetcher=fetcher)
    return SemanticQuery(query=query_vector, vectorset=vectorset, min_score=threshold)
105
+
106
+
107
def parse_keyword_min_score(
    min_score: Optional[Union[float, search_models.MinScore]],
) -> float:
    """Return the bm25 threshold for keyword search.

    The deprecated float form of ``min_score`` used to specify the *semantic*
    threshold, so it carries no bm25 value and falls back to 0.
    """
    if min_score is None or isinstance(min_score, float):
        return 0.0
    return min_score.bm25
116
+
117
+
118
async def parse_semantic_min_score(
    min_score: Optional[Union[float, search_models.MinScore]],
    *,
    fetcher: Fetcher,
) -> float:
    """Resolve the semantic similarity threshold for a request.

    Precedence: explicit user value (deprecated float payload or
    ``MinScore.semantic``) > default fetched from the Predict API > generic
    fallback (`DEFAULT_GENERIC_SEMANTIC_THRESHOLD`). Always returns a float.
    """
    threshold: Optional[float]
    if isinstance(min_score, search_models.MinScore):
        threshold = min_score.semantic
    else:
        # None, or the deprecated float payload (which meant semantic)
        threshold = min_score

    if threshold is None:
        # min score not defined by the user, we'll try to get the default
        # from Predict API
        threshold = await fetcher.get_semantic_min_score()
        if threshold is None:
            logger.warning(
                "Semantic threshold not found in query information, using default",
                extra={"kbid": fetcher.kbid},
            )
            threshold = DEFAULT_GENERIC_SEMANTIC_THRESHOLD

    return threshold
142
+
143
+
144
async def query_with_synonyms(
    query: str,
    *,
    fetcher: Fetcher,
) -> Optional[str]:
    """
    Replace the terms in the query with an expression that will make it match with the configured synonyms.
    We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html

    Example:
    - Synonyms: Foo -> Bar, Baz
    - Query: "What is Foo?"
    - Advanced Query: "What is (Foo OR Bar OR Baz)?"

    Returns None when there's nothing to expand (empty query, no synonyms
    configured, or no query term has synonyms).
    """
    if not query:
        return None

    kb_synonyms = await fetcher.get_synonyms()
    if kb_synonyms is None:
        # No synonyms found
        return None

    # Map each term that has synonyms to its OR-expression:
    # 'term' -> '(term OR synonym1 OR synonym2)'
    expansions: dict[str, str] = {
        term: "({})".format(" OR ".join([term] + list(entry.synonyms)))
        for term, entry in kb_synonyms.terms.items()
        if len(entry.synonyms) > 0
    }

    tokens = query.split()
    expanded_any = False
    for i, token in enumerate(tokens):
        # Strip punctuation so e.g. 'Foo?' still matches the synonym term 'Foo'
        bare = token.strip(string.punctuation)
        if bare in expansions:
            expanded_any = True
            # Swap only the bare term, keeping any surrounding punctuation
            tokens[i] = token.replace(bare, expansions[bare])

    if expanded_any:
        return " ".join(tokens)

    return None
@@ -18,37 +18,103 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
+ from typing import Optional
22
+
21
23
  from pydantic import ValidationError
22
24
 
25
+ from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
23
26
  from nucliadb.search.search.metrics import query_parser_observer
24
- from nucliadb.search.search.query_parser.exceptions import InternalParserError
27
+ from nucliadb.search.search.query import expand_entities
28
+ from nucliadb.search.search.query_parser.exceptions import InternalParserError, InvalidQueryError
29
+ from nucliadb.search.search.query_parser.fetcher import Fetcher
30
+ from nucliadb.search.search.query_parser.filter_expression import parse_expression
25
31
  from nucliadb.search.search.query_parser.models import (
32
+ Filters,
26
33
  NoopReranker,
34
+ ParsedQuery,
27
35
  PredictReranker,
36
+ Query,
28
37
  RankFusion,
29
38
  ReciprocalRankFusion,
39
+ RelationQuery,
30
40
  Reranker,
31
41
  UnitRetrieval,
32
42
  )
43
+ from nucliadb.search.search.query_parser.old_filters import OldFilterParams, parse_old_filters
44
+ from nucliadb.search.search.utils import filter_hidden_resources
33
45
  from nucliadb_models import search as search_models
46
+ from nucliadb_models.filters import FilterExpression
34
47
  from nucliadb_models.search import (
35
48
  FindRequest,
36
49
  )
50
+ from nucliadb_protos import nodereader_pb2, utils_pb2
51
+
52
+ from .common import (
53
+ parse_keyword_query,
54
+ parse_semantic_query,
55
+ parse_top_k,
56
+ validate_base_request,
57
+ )
37
58
 
38
59
 
39
60
@query_parser_observer.wrap({"type": "parse_find"})
async def parse_find(
    kbid: str,
    item: FindRequest,
    generative_model: Optional[str] = None,
    *,
    fetcher: Optional[Fetcher] = None,
) -> ParsedQuery:
    """Parse a find request into a `ParsedQuery` (retrieval only, no generation).

    A pre-built ``fetcher`` can be injected; otherwise a default one is
    created with `fetcher_for_find`.
    """
    fetcher = fetcher or fetcher_for_find(kbid, item, generative_model)
    retrieval = await _FindParser(kbid, item, fetcher).parse()
    return ParsedQuery(fetcher=fetcher, retrieval=retrieval, generation=None)
72
+
73
+
74
def fetcher_for_find(kbid: str, item: FindRequest, generative_model: Optional[str]) -> Fetcher:
    """Build the default `Fetcher` used while parsing a find request."""
    return Fetcher(
        kbid=kbid,
        query=item.query,
        user_vector=item.vector,
        vectorset=item.vectorset,
        rephrase=item.rephrase,
        rephrase_prompt=item.rephrase_prompt,
        generative_model=generative_model,
    )
43
84
 
44
85
 
45
86
  class _FindParser:
46
    def __init__(self, kbid: str, item: FindRequest, fetcher: Fetcher):
        self.kbid = kbid
        self.item = item
        # Fetcher shared across parsing steps; it caches predict/KB lookups
        self.fetcher = fetcher

        # cached data while parsing (filled in by parse())
        self._query: Optional[Query] = None
        self._top_k: Optional[int] = None
49
95
 
50
96
  async def parse(self) -> UnitRetrieval:
51
- top_k = self._parse_top_k()
97
+ validate_base_request(self.item)
98
+
99
+ self._top_k = parse_top_k(self.item)
100
+
101
+ # parse search types (features)
102
+
103
+ self._query = Query()
104
+
105
+ if search_models.SearchOptions.KEYWORD in self.item.features:
106
+ self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
107
+
108
+ if search_models.SearchOptions.SEMANTIC in self.item.features:
109
+ self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)
110
+
111
+ if search_models.SearchOptions.RELATIONS in self.item.features:
112
+ self._query.relation = await self._parse_relation_query()
113
+
114
+ # TODO: graph search
115
+
116
+ filters = await self._parse_filters()
117
+
52
118
  try:
53
119
  rank_fusion = self._parse_rank_fusion()
54
120
  except ValidationError as exc:
@@ -66,20 +132,116 @@ class _FindParser:
66
132
  rank_fusion.window = max(rank_fusion.window, reranker.window)
67
133
 
68
134
  return UnitRetrieval(
69
- top_k=top_k,
135
+ query=self._query,
136
+ top_k=self._top_k,
137
+ filters=filters,
70
138
  rank_fusion=rank_fusion,
71
139
  reranker=reranker,
72
140
  )
73
141
 
74
- def _parse_top_k(self) -> int:
75
- assert self.item.top_k is not None, "top_k must have an int value"
76
- top_k = self.item.top_k
77
- return top_k
142
+ async def _parse_relation_query(self) -> RelationQuery:
143
+ detected_entities = await self._get_detected_entities()
144
+
145
+ deleted_entity_groups = await self.fetcher.get_deleted_entity_groups()
146
+
147
+ meta_cache = await self.fetcher.get_entities_meta_cache()
148
+ deleted_entities = meta_cache.deleted_entities
149
+
150
+ return RelationQuery(
151
+ detected_entities=detected_entities,
152
+ deleted_entity_groups=deleted_entity_groups,
153
+ deleted_entities=deleted_entities,
154
+ )
155
+
156
+ async def _get_detected_entities(self) -> list[utils_pb2.RelationNode]:
157
+ """Get entities from request, either automatically detected or
158
+ explicitly set by the user."""
159
+
160
+ if self.item.query_entities:
161
+ detected_entities = []
162
+ for entity in self.item.query_entities:
163
+ relation_node = utils_pb2.RelationNode()
164
+ relation_node.value = entity.name
165
+ if entity.type is not None:
166
+ relation_node.ntype = RelationNodeTypeMap[entity.type]
167
+ if entity.subtype is not None:
168
+ relation_node.subtype = entity.subtype
169
+ detected_entities.append(relation_node)
170
+ else:
171
+ detected_entities = await self.fetcher.get_detected_entities()
172
+
173
+ meta_cache = await self.fetcher.get_entities_meta_cache()
174
+ detected_entities = expand_entities(meta_cache, detected_entities)
175
+
176
+ return detected_entities
177
+
178
    async def _parse_filters(self) -> Filters:
        """Build the `Filters` for the retrieval, merging the deprecated
        flat filter parameters with the newer ``filter_expression``.

        Requires `parse()` to have populated ``self._query`` first (the
        relation query is reused for autofilter).
        """
        assert self._query is not None, "query must be parsed before filters"

        # Any of the deprecated filtering parameters present?
        has_old_filters = (
            len(self.item.filters) > 0
            or len(self.item.resource_filters) > 0
            or len(self.item.fields) > 0
            or len(self.item.keyword_filters) > 0
            or self.item.range_creation_start is not None
            or self.item.range_creation_end is not None
            or self.item.range_modification_start is not None
            or self.item.range_modification_end is not None
        )
        # Old-style filters and the new filter_expression are mutually exclusive
        if self.item.filter_expression is not None and has_old_filters:
            raise InvalidQueryError("filter_expression", "Cannot mix old filters with filter_expression")

        field_expr = None
        paragraph_expr = None
        filter_operator = nodereader_pb2.FilterOperator.AND

        if has_old_filters:
            # Translate the deprecated parameters into field/paragraph expressions
            old_filters = OldFilterParams(
                label_filters=self.item.filters,
                keyword_filters=self.item.keyword_filters,
                range_creation_start=self.item.range_creation_start,
                range_creation_end=self.item.range_creation_end,
                range_modification_start=self.item.range_modification_start,
                range_modification_end=self.item.range_modification_end,
                fields=self.item.fields,
                key_filters=self.item.resource_filters,
            )
            field_expr, paragraph_expr = await parse_old_filters(old_filters, self.fetcher)

        if self.item.filter_expression is not None:
            if self.item.filter_expression.field:
                field_expr = await parse_expression(self.item.filter_expression.field, self.kbid)
            if self.item.filter_expression.paragraph:
                paragraph_expr = await parse_expression(self.item.filter_expression.paragraph, self.kbid)
            # Operator only applies when combining field and paragraph expressions
            if self.item.filter_expression.operator == FilterExpression.Operator.OR:
                filter_operator = nodereader_pb2.FilterOperator.OR
            else:
                filter_operator = nodereader_pb2.FilterOperator.AND

        autofilter = None
        if self.item.autofilter:
            # Reuse entities already detected for the relation query when available
            if self._query.relation is not None:
                autofilter = self._query.relation.detected_entities
            else:
                autofilter = await self._get_detected_entities()

        hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)

        return Filters(
            autofilter=autofilter,
            facets=[],
            field_expression=field_expr,
            paragraph_expression=paragraph_expr,
            filter_expression_operator=filter_operator,
            security=self.item.security,
            hidden=hidden,
            with_duplicates=self.item.with_duplicates,
        )
239
+ )
78
240
 
79
241
  def _parse_rank_fusion(self) -> RankFusion:
80
242
  rank_fusion: RankFusion
81
243
 
82
- top_k = self._parse_top_k()
244
+ top_k = parse_top_k(self.item)
83
245
  window = min(top_k, 500)
84
246
 
85
247
  if isinstance(self.item.rank_fusion, search_models.RankFusionName):
@@ -104,7 +266,7 @@ class _FindParser:
104
266
  def _parse_reranker(self) -> Reranker:
105
267
  reranking: Reranker
106
268
 
107
- top_k = self._parse_top_k()
269
+ top_k = parse_top_k(self.item)
108
270
 
109
271
  if isinstance(self.item.reranker, search_models.RerankerName):
110
272
  if self.item.reranker == search_models.RerankerName.NOOP: