PyPI - nucliadb - Versions diffs - 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl - Mend

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

migrations/0028_extracted_vectors_reference.py +61 -0
migrations/0029_backfill_field_status.py +149 -0
migrations/0030_label_deduplication.py +60 -0
nucliadb/common/cluster/manager.py +41 -331
nucliadb/common/cluster/rebalance.py +2 -2
nucliadb/common/cluster/rollover.py +12 -71
nucliadb/common/cluster/settings.py +3 -0
nucliadb/common/cluster/standalone/utils.py +0 -43
nucliadb/common/cluster/utils.py +0 -16
nucliadb/common/counters.py +1 -0
nucliadb/common/datamanagers/fields.py +48 -7
nucliadb/common/datamanagers/vectorsets.py +11 -2
nucliadb/common/external_index_providers/base.py +2 -1
nucliadb/common/external_index_providers/pinecone.py +3 -5
nucliadb/common/ids.py +18 -4
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +76 -37
nucliadb/export_import/models.py +3 -3
nucliadb/health.py +0 -7
nucliadb/ingest/app.py +0 -8
nucliadb/ingest/consumer/auditing.py +1 -1
nucliadb/ingest/consumer/shard_creator.py +1 -1
nucliadb/ingest/fields/base.py +83 -21
nucliadb/ingest/orm/brain.py +55 -56
nucliadb/ingest/orm/broker_message.py +12 -2
nucliadb/ingest/orm/entities.py +6 -17
nucliadb/ingest/orm/knowledgebox.py +44 -22
nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
nucliadb/ingest/orm/processor/processor.py +5 -2
nucliadb/ingest/orm/resource.py +222 -413
nucliadb/ingest/processing.py +8 -2
nucliadb/ingest/serialize.py +77 -46
nucliadb/ingest/service/writer.py +2 -56
nucliadb/ingest/settings.py +1 -4
nucliadb/learning_proxy.py +6 -4
nucliadb/purge/__init__.py +102 -12
nucliadb/purge/orphan_shards.py +6 -4
nucliadb/reader/api/models.py +3 -3
nucliadb/reader/api/v1/__init__.py +1 -0
nucliadb/reader/api/v1/download.py +2 -2
nucliadb/reader/api/v1/knowledgebox.py +3 -3
nucliadb/reader/api/v1/resource.py +23 -12
nucliadb/reader/api/v1/services.py +4 -4
nucliadb/reader/api/v1/vectorsets.py +48 -0
nucliadb/search/api/v1/ask.py +11 -1
nucliadb/search/api/v1/feedback.py +3 -3
nucliadb/search/api/v1/knowledgebox.py +8 -13
nucliadb/search/api/v1/search.py +3 -2
nucliadb/search/api/v1/suggest.py +0 -2
nucliadb/search/predict.py +6 -4
nucliadb/search/requesters/utils.py +1 -2
nucliadb/search/search/chat/ask.py +77 -13
nucliadb/search/search/chat/prompt.py +16 -5
nucliadb/search/search/chat/query.py +74 -34
nucliadb/search/search/exceptions.py +2 -7
nucliadb/search/search/find.py +9 -5
nucliadb/search/search/find_merge.py +10 -4
nucliadb/search/search/graph_strategy.py +884 -0
nucliadb/search/search/hydrator.py +6 -0
nucliadb/search/search/merge.py +79 -24
nucliadb/search/search/query.py +74 -245
nucliadb/search/search/query_parser/exceptions.py +11 -1
nucliadb/search/search/query_parser/fetcher.py +405 -0
nucliadb/search/search/query_parser/models.py +0 -3
nucliadb/search/search/query_parser/parser.py +22 -21
nucliadb/search/search/rerankers.py +1 -42
nucliadb/search/search/shards.py +19 -0
nucliadb/standalone/api_router.py +2 -14
nucliadb/standalone/settings.py +4 -0
nucliadb/train/generators/field_streaming.py +7 -3
nucliadb/train/lifecycle.py +3 -6
nucliadb/train/nodes.py +14 -12
nucliadb/train/resource.py +380 -0
nucliadb/writer/api/constants.py +20 -16
nucliadb/writer/api/v1/__init__.py +1 -0
nucliadb/writer/api/v1/export_import.py +1 -1
nucliadb/writer/api/v1/field.py +13 -7
nucliadb/writer/api/v1/knowledgebox.py +3 -46
nucliadb/writer/api/v1/resource.py +20 -13
nucliadb/writer/api/v1/services.py +10 -1
nucliadb/writer/api/v1/upload.py +61 -34
nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
nucliadb/writer/back_pressure.py +17 -46
nucliadb/writer/resource/basic.py +9 -7
nucliadb/writer/resource/field.py +42 -9
nucliadb/writer/settings.py +2 -2
nucliadb/writer/tus/gcs.py +11 -10
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
nucliadb/common/cluster/discovery/base.py +0 -178
nucliadb/common/cluster/discovery/k8s.py +0 -301
nucliadb/common/cluster/discovery/manual.py +0 -57
nucliadb/common/cluster/discovery/single.py +0 -51
nucliadb/common/cluster/discovery/types.py +0 -32
nucliadb/common/cluster/discovery/utils.py +0 -67
nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
nucliadb/common/cluster/standalone/index_node.py +0 -123
nucliadb/common/cluster/standalone/service.py +0 -84
nucliadb/standalone/introspect.py +0 -208
nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

nucliadb/search/search/chat/ask.py CHANGED Viewed

@@ -49,6 +49,7 @@ from nucliadb.search.search.chat.query import (
     ChatAuditor,
     get_find_results,
     get_relations_results,
+    maybe_audit_chat,
     rephrase_query,
     sorted_prompt_context_list,
     tokens_to_chars,
@@ -57,6 +58,7 @@ from nucliadb.search.search.exceptions import (
     IncompleteFindResultsError,
     InvalidQueryError,
 )
+from nucliadb.search.search.graph_strategy import get_graph_results
 from nucliadb.search.search.metrics import RAGMetrics
 from nucliadb.search.search.query import QueryParser
 from nucliadb.search.utilities import get_predict
@@ -75,6 +77,7 @@ from nucliadb_models.search import (
     ErrorAskResponseItem,
     FindParagraph,
     FindRequest,
+    GraphStrategy,
     JSONAskResponseItem,
     KnowledgeboxFindResults,
     MetadataAskResponseItem,
@@ -126,7 +129,7 @@ class AskResult:
         main_results: KnowledgeboxFindResults,
         prequeries_results: Optional[list[PreQueryResult]],
         nuclia_learning_id: Optional[str],
-        predict_answer_stream: AsyncGenerator[GenerativeChunk, None],
+        predict_answer_stream: Optional[AsyncGenerator[GenerativeChunk, None]],
         prompt_context: PromptContext,
         prompt_context_order: PromptContextOrder,
         auditor: ChatAuditor,
@@ -393,6 +396,9 @@ class AskResult:
         This method does not assume any order in the stream of items, but it assumes that at least
         the answer text is streamed in order.
         """
+        if self.predict_answer_stream is None:
+            # In some cases, clients may want to skip the answer generation step
+            return
         async for generative_chunk in self.predict_answer_stream:
             item = generative_chunk.chunk
             if isinstance(item, TextGenerativeResponse):
@@ -431,14 +437,14 @@ class NotEnoughContextAskResult(AskResult):
         """
         yield self._ndjson_encode(RetrievalAskResponseItem(results=self.main_results))
         yield self._ndjson_encode(AnswerAskResponseItem(text=NOT_ENOUGH_CONTEXT_ANSWER))
-        status = AnswerStatusCode.NO_CONTEXT
+        status = AnswerStatusCode.NO_RETRIEVAL_DATA
         yield self._ndjson_encode(StatusAskResponseItem(code=status.value, status=status.prettify()))
     async def json(self) -> str:
         return SyncAskResponse(
             answer=NOT_ENOUGH_CONTEXT_ANSWER,
             retrieval_results=self.main_results,
-            status=AnswerStatusCode.NO_CONTEXT,
+            status=AnswerStatusCode.NO_RETRIEVAL_DATA.prettify(),
         ).model_dump_json()
@@ -485,6 +491,31 @@ async def ask(
             resource=resource,
         )
     except NoRetrievalResultsError as err:
+        try:
+            rephrase_time = metrics.elapsed("rephrase")
+        except KeyError:
+            # Not all ask requests have a rephrase step
+            rephrase_time = None
+        maybe_audit_chat(
+            kbid=kbid,
+            user_id=user_id,
+            client_type=client_type,
+            origin=origin,
+            generative_answer_time=0,
+            generative_answer_first_chunk_time=0,
+            rephrase_time=rephrase_time,
+            user_query=user_query,
+            rephrased_query=rephrased_query,
+            text_answer=b"",
+            status_code=AnswerStatusCode.NO_RETRIEVAL_DATA,
+            chat_history=chat_history,
+            query_context={},
+            query_context_order={},
+            learning_id=None,
+            model=ask_request.generative_model,
+        )
         # If a retrieval was attempted but no results were found,
         # early return the ask endpoint without querying the generative model
         return NotEnoughContextAskResult(
@@ -503,6 +534,7 @@ async def ask(
             ordered_paragraphs=[match.paragraph for match in retrieval_results.best_matches],
             resource=resource,
             user_context=user_context,
+            user_image_context=ask_request.extra_context_images,
             strategies=ask_request.rag_strategies,
             image_strategies=ask_request.rag_images_strategies,
             max_context_characters=tokens_to_chars(max_tokens_context),
@@ -534,14 +566,18 @@ async def ask(
         rerank_context=False,
         top_k=ask_request.top_k,
     )
-    with metrics.time("stream_start"):
-        predict = get_predict()
-        (
-            nuclia_learning_id,
-            nuclia_learning_model,
-            predict_answer_stream,
-        ) = await predict.chat_query_ndjson(kbid, chat_model)
-        debug_chat_model = chat_model
+    nuclia_learning_id = None
+    nuclia_learning_model = None
+    predict_answer_stream = None
+    if ask_request.generate_answer:
+        with metrics.time("stream_start"):
+            predict = get_predict()
+            (
+                nuclia_learning_id,
+                nuclia_learning_model,
+                predict_answer_stream,
+            ) = await predict.chat_query_ndjson(kbid, chat_model)
     auditor = ChatAuditor(
         kbid=kbid,
@@ -562,13 +598,13 @@ async def ask(
         main_results=retrieval_results.main_query,
         prequeries_results=retrieval_results.prequeries,
         nuclia_learning_id=nuclia_learning_id,
-        predict_answer_stream=predict_answer_stream,  # type: ignore
+        predict_answer_stream=predict_answer_stream,
         prompt_context=prompt_context,
         prompt_context_order=prompt_context_order,
         auditor=auditor,
         metrics=metrics,
         best_matches=retrieval_results.best_matches,
-        debug_chat_model=debug_chat_model,
+        debug_chat_model=chat_model,
     )
@@ -629,6 +665,13 @@ def parse_prequeries(ask_request: AskRequest) -> Optional[PreQueriesStrategy]:
     return None
+def parse_graph_strategy(ask_request: AskRequest) -> Optional[GraphStrategy]:
+    for rag_strategy in ask_request.rag_strategies:
+        if rag_strategy.name == RagStrategyName.GRAPH:
+            return cast(GraphStrategy, rag_strategy)
+    return None
 async def retrieval_step(
     kbid: str,
     main_query: str,
@@ -675,6 +718,7 @@ async def retrieval_in_kb(
     metrics: RAGMetrics,
 ) -> RetrievalResults:
     prequeries = parse_prequeries(ask_request)
+    graph_strategy = parse_graph_strategy(ask_request)
     with metrics.time("retrieval"):
         main_results, prequeries_results, query_parser = await get_find_results(
             kbid=kbid,
@@ -686,6 +730,26 @@ async def retrieval_in_kb(
             metrics=metrics,
             prequeries_strategy=prequeries,
         )
+        if graph_strategy is not None:
+            graph_results, graph_request = await get_graph_results(
+                kbid=kbid,
+                query=main_query,
+                item=ask_request,
+                ndb_client=client_type,
+                user=user_id,
+                origin=origin,
+                graph_strategy=graph_strategy,
+                metrics=metrics,
+                shards=ask_request.shards,
+            )
+            if prequeries_results is None:
+                prequeries_results = []
+            prequery = PreQuery(id="graph", request=graph_request, weight=graph_strategy.weight)
+            prequeries_results.append((prequery, graph_results))
         if len(main_results.resources) == 0 and all(
             len(prequery_result.resources) == 0 for (_, prequery_result) in prequeries_results or []
         ):

nucliadb/search/search/chat/prompt.py CHANGED Viewed

@@ -28,6 +28,7 @@ from pydantic import BaseModel
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
 from nucliadb.common.maindb.utils import get_driver
+from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
@@ -41,6 +42,7 @@ from nucliadb.search.search.chat.images import (
 )
 from nucliadb.search.search.hydrator import hydrate_field_text, hydrate_resource_text
 from nucliadb.search.search.paragraphs import get_paragraph_text
+from nucliadb_models.labels import translate_alias_to_system_label
 from nucliadb_models.metadata import Extra, Origin
 from nucliadb_models.search import (
     SCORE_TYPE,
@@ -49,6 +51,7 @@ from nucliadb_models.search import (
     FindParagraph,
     FullResourceStrategy,
     HierarchyResourceStrategy,
+    Image,
     ImageRagStrategy,
     ImageRagStrategyName,
     MetadataExtensionStrategy,
@@ -266,7 +269,9 @@ async def full_resource_prompt_context(
                 if strategy.apply_to is not None:
                     # decide whether the resource should be extended or not
                     for label in strategy.apply_to.exclude:
-                        skip = skip or (label in (paragraph.labels or []))
+                        skip = skip or (
+                            translate_alias_to_system_label(label) in (paragraph.labels or [])
+                        )
                 if not skip:
                     ordered_resources.append(resource_uuid)
@@ -346,7 +351,7 @@ async def extend_prompt_context_with_origin_metadata(context, kbid, text_block_i
         if resource is not None:
             pb_origin = await resource.get_origin()
             if pb_origin is not None:
-                origin = Origin.from_message(pb_origin)
+                origin = from_proto.origin(pb_origin)
         return rid, origin
     rids = {tb_id.rid for tb_id in text_block_ids}
@@ -433,7 +438,7 @@ async def extend_prompt_context_with_extra_metadata(context, kbid, text_block_id
         if resource is not None:
             pb_extra = await resource.get_extra()
             if pb_extra is not None:
-                extra = Extra.from_message(pb_extra)
+                extra = from_proto.extra(pb_extra)
         return rid, extra
     rids = {tb_id.rid for tb_id in text_block_ids}
@@ -876,6 +881,7 @@ class PromptContextBuilder:
         ordered_paragraphs: list[FindParagraph],
         resource: Optional[str] = None,
         user_context: Optional[list[str]] = None,
+        user_image_context: Optional[list[Image]] = None,
         strategies: Optional[Sequence[RagStrategy]] = None,
         image_strategies: Optional[Sequence[ImageRagStrategy]] = None,
         max_context_characters: Optional[int] = None,
@@ -885,6 +891,7 @@ class PromptContextBuilder:
         self.ordered_paragraphs = ordered_paragraphs
         self.resource = resource
         self.user_context = user_context
+        self.user_image_context = user_image_context
         self.strategies = strategies
         self.image_strategies = image_strategies
         self.max_context_characters = max_context_characters
@@ -895,6 +902,8 @@ class PromptContextBuilder:
         # it is added first, followed by the found text blocks in order of relevance
         for i, text_block in enumerate(self.user_context or []):
             context[f"USER_CONTEXT_{i}"] = text_block
+        for i, image in enumerate(self.user_image_context or []):
+            context.images[f"USER_IMAGE_CONTEXT_{i}"] = image
     async def build(
         self,
@@ -1012,8 +1021,10 @@ class PromptContextBuilder:
                 neighbouring_paragraphs = cast(NeighbouringParagraphsStrategy, strategy)
             elif strategy.name == RagStrategyName.METADATA_EXTENSION:
                 metadata_extension = cast(MetadataExtensionStrategy, strategy)
-            elif strategy.name != RagStrategyName.PREQUERIES:  # pragma: no cover
-                # Prequeries are not handled here
+            elif (
+                strategy.name != RagStrategyName.PREQUERIES and strategy.name != RagStrategyName.GRAPH
+            ):  # pragma: no cover
+                # Prequeries and graph are not handled here
                 logger.warning(
                     "Unknown rag strategy",
                     extra={"strategy": strategy.name, "kbid": self.kbid},

nucliadb/search/search/chat/query.py CHANGED Viewed

@@ -18,8 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Optional
+from typing import Iterable, Optional
+from nucliadb.common.models_utils import to_proto
 from nucliadb.search import logger
 from nucliadb.search.predict import AnswerStatusCode
 from nucliadb.search.requesters.utils import Method, node_query
@@ -49,7 +50,13 @@ from nucliadb_models.search import (
     parse_rephrase_prompt,
 )
 from nucliadb_protos import audit_pb2
-from nucliadb_protos.nodereader_pb2 import RelationSearchResponse, SearchRequest, SearchResponse
+from nucliadb_protos.nodereader_pb2 import (
+    EntitiesSubgraphRequest,
+    RelationSearchResponse,
+    SearchRequest,
+    SearchResponse,
+)
+from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_telemetry.errors import capture_exception
 from nucliadb_utils.utilities import get_audit
@@ -144,15 +151,7 @@ async def get_find_results(
     return main_results, prequeries_results, query_parser
-async def run_main_query(
-    kbid: str,
-    query: str,
-    item: AskRequest,
-    ndb_client: NucliaDBClientType,
-    user: str,
-    origin: str,
-    metrics: RAGMetrics = RAGMetrics(),
-) -> tuple[KnowledgeboxFindResults, QueryParser]:
+def find_request_from_ask_request(item: AskRequest, query: str) -> FindRequest:
     find_request = FindRequest()
     find_request.resource_filters = item.resource_filters
     find_request.features = []
@@ -188,7 +187,19 @@ async def run_main_query(
     find_request.show_hidden = item.show_hidden
     # this executes the model validators, that can tweak some fields
-    FindRequest.model_validate(find_request)
+    return FindRequest.model_validate(find_request)
+async def run_main_query(
+    kbid: str,
+    query: str,
+    item: AskRequest,
+    ndb_client: NucliaDBClientType,
+    user: str,
+    origin: str,
+    metrics: RAGMetrics = RAGMetrics(),
+) -> tuple[KnowledgeboxFindResults, QueryParser]:
+    find_request = find_request_from_ask_request(item, query)
     find_results, incomplete, query_parser = await find(
         kbid,
@@ -210,36 +221,65 @@ async def get_relations_results(
     text_answer: str,
     target_shard_replicas: Optional[list[str]],
     timeout: Optional[float] = None,
+    only_with_metadata: bool = False,
+    only_agentic_relations: bool = False,
 ) -> Relations:
     try:
         predict = get_predict()
         detected_entities = await predict.detect_entities(kbid, text_answer)
-        request = SearchRequest()
-        request.relation_subgraph.entry_points.extend(detected_entities)
-        request.relation_subgraph.depth = 1
-        results: list[SearchResponse]
-        (
-            results,
-            _,
-            _,
-        ) = await node_query(
-            kbid,
-            Method.SEARCH,
-            request,
+        return await get_relations_results_from_entities(
+            kbid=kbid,
+            entities=detected_entities,
             target_shard_replicas=target_shard_replicas,
             timeout=timeout,
-            use_read_replica_nodes=True,
-            retry_on_primary=False,
+            only_with_metadata=only_with_metadata,
+            only_agentic_relations=only_agentic_relations,
         )
-        relations_results: list[RelationSearchResponse] = [result.relation for result in results]
-        return await merge_relations_results(relations_results, request.relation_subgraph)
     except Exception as exc:
         capture_exception(exc)
         logger.exception("Error getting relations results")
         return Relations(entities={})
+async def get_relations_results_from_entities(
+    *,
+    kbid: str,
+    entities: Iterable[RelationNode],
+    target_shard_replicas: Optional[list[str]],
+    timeout: Optional[float] = None,
+    only_with_metadata: bool = False,
+    only_agentic_relations: bool = False,
+    deleted_entities: set[str] = set(),
+) -> Relations:
+    request = SearchRequest()
+    request.relation_subgraph.entry_points.extend(entities)
+    request.relation_subgraph.depth = 1
+    deleted = EntitiesSubgraphRequest.DeletedEntities()
+    deleted.node_values.extend(deleted_entities)
+    request.relation_subgraph.deleted_entities.append(deleted)
+    results: list[SearchResponse]
+    (
+        results,
+        _,
+        _,
+    ) = await node_query(
+        kbid,
+        Method.SEARCH,
+        request,
+        target_shard_replicas=target_shard_replicas,
+        timeout=timeout,
+        use_read_replica_nodes=True,
+        retry_on_primary=False,
+    )
+    relations_results: list[RelationSearchResponse] = [result.relation for result in results]
+    return await merge_relations_results(
+        relations_results, request.relation_subgraph, only_with_metadata, only_agentic_relations
+    )
 def maybe_audit_chat(
     *,
     kbid: str,
@@ -256,8 +296,8 @@ def maybe_audit_chat(
     chat_history: list[ChatContextMessage],
     query_context: PromptContext,
     query_context_order: PromptContextOrder,
-    learning_id: str,
-    model: str,
+    learning_id: Optional[str],
+    model: Optional[str],
 ):
     audit = get_audit()
     if audit is None:
@@ -278,7 +318,7 @@ def maybe_audit_chat(
     audit.chat(
         kbid,
         user_id,
-        client_type.to_proto(),
+        to_proto.client_type(client_type),
         origin,
         question=user_query,
         generative_answer_time=generative_answer_time,
@@ -295,7 +335,7 @@ def maybe_audit_chat(
 def parse_audit_answer(raw_text_answer: bytes, status_code: AnswerStatusCode) -> Optional[str]:
-    if status_code == AnswerStatusCode.NO_CONTEXT:
+    if status_code == AnswerStatusCode.NO_CONTEXT or status_code == AnswerStatusCode.NO_RETRIEVAL_DATA:
         # We don't want to audit "Not enough context to answer this." and instead set a None.
         return None
     return raw_text_answer.decode()
@@ -320,7 +360,7 @@ class ChatAuditor:
         learning_id: Optional[str],
         query_context: PromptContext,
         query_context_order: PromptContextOrder,
-        model: str,
+        model: Optional[str],
     ):
         self.kbid = kbid
         self.user_id = user_id

nucliadb/search/search/exceptions.py CHANGED Viewed

@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
+from nucliadb.search.search.query_parser.exceptions import InvalidQueryError as InvalidQueryError
 class IncompleteFindResultsError(Exception):
     pass
@@ -24,10 +26,3 @@ class IncompleteFindResultsError(Exception):
 class ResourceNotFoundError(Exception):
     pass
-class InvalidQueryError(Exception):
-    def __init__(self, param: str, reason: str):
-        self.param = param
-        self.reason = reason
-        super().__init__(f"Invalid query. Error in {param}: {reason}")

nucliadb/search/search/find.py CHANGED Viewed

@@ -24,6 +24,7 @@ from typing import Optional
 from nucliadb.common.external_index_providers.base import ExternalIndexManager
 from nucliadb.common.external_index_providers.manager import get_external_index_manager
+from nucliadb.common.models_utils import to_proto
 from nucliadb.search.requesters.utils import Method, debug_nodes_info, node_query
 from nucliadb.search.search.find_merge import (
     build_find_response,
@@ -105,7 +106,7 @@ async def _index_node_retrieval(
         kbid, item, generative_model=generative_model
     )
     with metrics.time("query_parse"):
-        pb_query, incomplete_results, autofilters = await query_parser.parse()
+        pb_query, incomplete_results, autofilters, rephrased_query = await query_parser.parse()
     with metrics.time("node_query"):
         results, query_incomplete_results, queried_nodes = await node_query(
@@ -119,7 +120,8 @@ async def _index_node_retrieval(
             results,
             kbid=kbid,
             query=pb_query.body,
-            relation_subgraph_query=pb_query.relations.subgraph,
+            rephrased_query=rephrased_query,
+            relation_subgraph_query=pb_query.relation_subgraph,
             min_score_bm25=pb_query.min_score_bm25,
             min_score_semantic=pb_query.min_score_semantic,
             top_k=item.top_k,
@@ -136,7 +138,7 @@ async def _index_node_retrieval(
         audit.search(
             kbid,
             x_nucliadb_user,
-            x_ndb_client.to_proto(),
+            to_proto.client_type(x_ndb_client),
             x_forwarded_for,
             pb_query,
             search_time,
@@ -193,7 +195,7 @@ async def _external_index_retrieval(
     query_parser, _, reranker = await query_parser_from_find_request(
         kbid, item, generative_model=generative_model
     )
-    search_request, incomplete_results, _ = await query_parser.parse()
+    search_request, incomplete_results, _, rephrased_query = await query_parser.parse()
     # Query index
     query_results = await external_index_manager.query(search_request)  # noqa
@@ -224,6 +226,7 @@ async def _external_index_retrieval(
     retrieval_results = KnowledgeboxFindResults(
         resources=find_resources,
         query=item.query,
+        rephrased_query=rephrased_query,
         total=0,
         page_number=0,
         page_size=item.top_k,
@@ -259,7 +262,7 @@ async def query_parser_from_find_request(
     # XXX this is becoming the new /find query parsing, this should be moved to
     # a cleaner abstraction
-    parsed = parse_find(item)
+    parsed = await parse_find(kbid, item)
     rank_fusion = get_rank_fusion(parsed.rank_fusion)
     reranker = get_reranker(parsed.reranker)
@@ -268,6 +271,7 @@ async def query_parser_from_find_request(
         kbid=kbid,
         features=item.features,
         query=item.query,
+        query_entities=item.query_entities,
         label_filters=item.filters,
         keyword_filters=item.keyword_filters,
         faceted=None,

nucliadb/search/search/find_merge.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Iterable, Union
+from typing import Iterable, Optional, Union
 from nucliadb.common.external_index_providers.base import TextBlockMatch
 from nucliadb.common.ids import ParagraphId, VectorId
@@ -74,6 +74,7 @@ async def build_find_response(
     *,
     kbid: str,
     query: str,
+    rephrased_query: Optional[str],
     relation_subgraph_query: EntitiesSubgraphRequest,
     top_k: int,
     min_score_bm25: float,
@@ -96,9 +97,13 @@ async def build_find_response(
         )
     )
-    merged_text_blocks: list[TextBlockMatch] = rank_fusion_algorithm.fuse(
-        keyword_results, semantic_results
-    )
+    merged_text_blocks: list[TextBlockMatch]
+    if len(keyword_results) == 0:
+        merged_text_blocks = semantic_results
+    elif len(semantic_results) == 0:
+        merged_text_blocks = keyword_results
+    else:
+        merged_text_blocks = rank_fusion_algorithm.fuse(keyword_results, semantic_results)
     # cut
     # we assume pagination + predict reranker is forbidden and has been already
@@ -139,6 +144,7 @@ async def build_find_response(
     find_results = KnowledgeboxFindResults(
         query=query,
+        rephrased_query=rephrased_query,
         resources=find_resources,
         best_matches=best_matches,
         relations=relations,

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl