nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/query_parser/parsers/unit_retrieval.py

@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional

 from nidx_protos import nodereader_pb2
 from nidx_protos.nodereader_pb2 import SearchRequest
@@ -25,34 +24,14 @@ from nidx_protos.nodereader_pb2 import SearchRequest
 from nucliadb.common.filter_expression import add_and_expression
 from nucliadb.search.search.filters import translate_label
 from nucliadb.search.search.metrics import node_features, query_parser_observer
-from nucliadb.search.search.query import apply_entities_filter, get_sort_field_proto
+from nucliadb.search.search.query import get_sort_field_proto
 from nucliadb.search.search.query_parser.models import ParsedQuery, PredictReranker, UnitRetrieval
 from nucliadb.search.search.query_parser.parsers.graph import parse_path_query
-from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
+from nucliadb_models.labels import LABEL_HIDDEN
 from nucliadb_models.search import SortOrderMap
 from nucliadb_protos import utils_pb2


-@query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
-async def legacy_convert_retrieval_to_proto(
-    parsed: ParsedQuery,
-) -> tuple[SearchRequest, bool, list[str], Optional[str]]:
-    converter = _Converter(parsed.retrieval)
-    request = converter.into_search_request()
-
-    # XXX: legacy values that were returned by QueryParser but not always
-    # needed. We should find a better abstraction
-
-    incomplete = is_incomplete(parsed.retrieval)
-    autofilter = converter._autofilter
-
-    rephrased_query = None
-    if parsed.retrieval.query.semantic:
-        rephrased_query = await parsed.fetcher.get_rephrased_query()
-
-    return request, incomplete, autofilter, rephrased_query
-
-
 @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
 def convert_retrieval_to_proto(retrieval: UnitRetrieval) -> SearchRequest:
     converter = _Converter(retrieval)
@@ -65,8 +44,6 @@ class _Converter:
         self.req = nodereader_pb2.SearchRequest()
         self.retrieval = retrieval

-        self._autofilter: list[str] = []
-
     def into_search_request(self) -> nodereader_pb2.SearchRequest:
         """Generate a SearchRequest proto from a retrieval operation."""
         self._apply_text_queries()
@@ -75,6 +52,7 @@ class _Converter:
         self._apply_graph_query()
         self._apply_filters()
         self._apply_top_k()
+
         return self.req

     def _apply_text_queries(self) -> None:
@@ -235,10 +213,6 @@ class _Converter:
         self.req.paragraph_filter.CopyFrom(self.retrieval.filters.paragraph_expression)
         self.req.filter_operator = self.retrieval.filters.filter_expression_operator

-        if self.retrieval.filters.autofilter:
-            entity_filters = apply_entities_filter(self.req, self.retrieval.filters.autofilter)
-            self._autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
-
         if self.retrieval.filters.hidden is not None:
             expr = nodereader_pb2.FilterExpression()
             if self.retrieval.filters.hidden:
@@ -281,3 +255,8 @@ def is_incomplete(retrieval: UnitRetrieval) -> bool:
         return False
     incomplete = retrieval.query.semantic.query is None or len(retrieval.query.semantic.query) == 0
     return incomplete
+
+
+def get_rephrased_query(parsed: ParsedQuery) -> str | None:
+    """Given a parsed query, return the rephrased query used, if any."""
+    return parsed.fetcher.get_cached_rephrased_query()
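
The removed legacy_convert_retrieval_to_proto() bundled the proto conversion, the "incomplete" flag and the rephrased query in one call; after this change, callers compose the remaining helpers themselves. A minimal sketch of such a caller follows (not part of the diff; the build_search_request name is hypothetical, and `parsed` is assumed to be a ParsedQuery produced elsewhere by the query parser):

from nucliadb.search.search.query_parser.parsers.unit_retrieval import (
    convert_retrieval_to_proto,
    get_rephrased_query,
    is_incomplete,
)

def build_search_request(parsed):
    # Build the nidx SearchRequest proto from the parsed retrieval
    request = convert_retrieval_to_proto(parsed.retrieval)
    # Flag retrievals whose semantic query could not be computed
    incomplete = is_incomplete(parsed.retrieval)
    # Cached rephrased query, if the fetcher produced one
    rephrased = get_rephrased_query(parsed)
    return request, incomplete, rephrased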
nucliadb/search/search/rank_fusion.py

@@ -20,11 +20,12 @@
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import Optional, TypeVar
+from typing import TypeVar

 from nucliadb.common.external_index_providers.base import ScoredTextBlock
 from nucliadb.common.ids import ParagraphId
 from nucliadb.search.search.query_parser import models as parser_models
+from nucliadb_models.retrieval import RrfScore, Score, WeightedCombSumScore
 from nucliadb_models.search import SCORE_TYPE
 from nucliadb_telemetry.metrics import Observer

@@ -127,7 +128,7 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         k: float = 60.0,
         *,
         window: int,
-        weights: Optional[dict[str, float]] = None,
+        weights: dict[str, float] | None = None,
         default_weight: float = 1.0,
     ):
         super().__init__(window)
@@ -145,7 +146,7 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         sources: dict[str, list[ScoredItem]],
     ) -> list[ScoredItem]:
         # accumulated scores per paragraph
-        scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        scores: dict[ParagraphId, tuple[float, SCORE_TYPE, list[Score]]] = {}
         # pointers from paragraph to the original source
         match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}

@@ -161,11 +162,12 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         for i, (ranking, weight) in enumerate(rankings):
             for rank, item in enumerate(ranking):
                 id = item.paragraph_id
-                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score, score_type, history = scores.setdefault(id, (0, item.score_type, []))
                 score += 1 / (self._k + rank) * weight
+                history.append(item.current_score)
                 if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                     score_type = SCORE_TYPE.BOTH
-                scores[id] = (score, score_type)
+                scores[id] = (score, score_type, history)

                 position = (i, rank)
                 match_positions.setdefault(item.paragraph_id, []).append(position)
@@ -175,9 +177,10 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
             # we are getting only one position, effectively deduplicating
             # multiple matches for the same text block
             i, j = match_positions[paragraph_id][0]
-            score, score_type = scores[paragraph_id]
+            score, score_type, history = scores[paragraph_id]
             item = rankings[i][0][j]
-            item.score = score
+            history.append(RrfScore(score=score))
+            item.scores = history
             item.score_type = score_type
             merged.append(item)

@@ -207,7 +210,7 @@ class WeightedCombSum(RankFusionAlgorithm):
         self,
         *,
         window: int,
-        weights: Optional[dict[str, float]] = None,
+        weights: dict[str, float] | None = None,
         default_weight: float = 1.0,
     ):
         super().__init__(window)
@@ -217,7 +220,7 @@ class WeightedCombSum(RankFusionAlgorithm):
     @rank_fusion_observer.wrap({"type": "weighted_comb_sum"})
     def _fuse(self, sources: dict[str, list[ScoredItem]]) -> list[ScoredItem]:
         # accumulated scores per paragraph
-        scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        scores: dict[ParagraphId, tuple[float, SCORE_TYPE, list[Score]]] = {}
         # pointers from paragraph to the original source
         match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}

@@ -228,11 +231,12 @@ class WeightedCombSum(RankFusionAlgorithm):
         for i, (ranking, weight) in enumerate(rankings):
             for j, item in enumerate(ranking):
                 id = item.paragraph_id
-                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score, score_type, history = scores.setdefault(id, (0, item.score_type, []))
                 score += item.score * weight
+                history.append(item.current_score)
                 if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                     score_type = SCORE_TYPE.BOTH
-                scores[id] = (score, score_type)
+                scores[id] = (score, score_type, history)

                 position = (i, j)
                 match_positions.setdefault(item.paragraph_id, []).append(position)
@@ -242,9 +246,10 @@ class WeightedCombSum(RankFusionAlgorithm):
             # we are getting only one position, effectively deduplicating
             # multiple matches for the same text block
             i, j = match_positions[paragraph_id][0]
-            score, score_type = scores[paragraph_id]
+            item = rankings[i][0][j]
-            item.score = score
+            history.append(WeightedCombSumScore(score=score))
+            item.scores = history
             item.score_type = score_type
             merged.append(item)

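The accumulation above is plain reciprocal rank fusion: each source contributes weight / (k + rank) per item, and items matched by several sources add their contributions (the new `scores`/`history` fields only record the per-source scores alongside the fused one). The following illustrative-only sketch reproduces that arithmetic on plain dicts instead of nucliadb's ScoredItem objects, so the effect of k = 60 and the cross-source boost is easy to see:

def rrf(rankings: dict[str, list[str]], k: float = 60.0, weights: dict[str, float] | None = None) -> list[tuple[str, float]]:
    # Each source contributes weight / (k + rank); shared items accumulate.
    weights = weights or {}
    fused: dict[str, float] = {}
    for source, ranking in rankings.items():
        w = weights.get(source, 1.0)
        for rank, item_id in enumerate(ranking):
            fused[item_id] = fused.get(item_id, 0.0) + 1 / (k + rank) * w
    return sorted(fused.items(), key=lambda kv: kv[1], reverse=True)

# "p1" appears in both rankings, so it outranks the single-source matches.
print(rrf({"keyword": ["p1", "p2"], "semantic": ["p3", "p1"]}))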
nucliadb/search/search/rerankers.py

@@ -21,7 +21,8 @@
 import logging
 from abc import ABC, abstractmethod, abstractproperty
 from dataclasses import dataclass
-from typing import Optional
+
+from typing_extensions import assert_never

 from nucliadb.search.predict import ProxiedPredictAPIError, SendToPredictError
 from nucliadb.search.search.query_parser import models as parser_models
@@ -63,7 +64,7 @@ class RerankingOptions:

 class Reranker(ABC):
     @abstractproperty
-    def window(self) -> Optional[int]:
+    def window(self) -> int | None:
         """Number of elements the reranker requests. `None` means no specific
         window is enforced."""
         ...
@@ -102,7 +103,7 @@ class NoopReranker(Reranker):
     """

     @property
-    def window(self) -> Optional[int]:
+    def window(self) -> int | None:
         return None

     @reranker_observer.wrap({"type": "noop"})
@@ -182,9 +183,7 @@ def get_reranker(reranker: parser_models.Reranker) -> Reranker:
         algorithm = PredictReranker(reranker.window)

     else:  # pragma: no cover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(reranker)

     return algorithm

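The `assert_never(reranker)` call replaces the old `_a: int = "a"` typing trick with the standard exhaustiveness check: mypy reports an error if any reranker variant is left unhandled, and the call also raises at runtime if one slips through. An illustrative-only sketch of the pattern, using a toy enum rather than nucliadb's reranker models:

from enum import Enum, auto
from typing_extensions import assert_never

class Kind(Enum):
    NOOP = auto()
    PREDICT = auto()

def pick(kind: Kind) -> str:
    if kind is Kind.NOOP:
        return "noop"
    elif kind is Kind.PREDICT:
        return "predict"
    else:
        # mypy errors here if a Kind member is not handled above;
        # at runtime an unhandled value raises an AssertionError.
        assert_never(kind)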
nucliadb/search/search/retrieval.py (new file)

@@ -0,0 +1,300 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from collections.abc import Iterable
+
+from nidx_protos.nodereader_pb2 import (
+    DocumentScored,
+    GraphSearchResponse,
+    ParagraphResult,
+    ParagraphSearchResponse,
+    SearchRequest,
+    SearchResponse,
+    VectorSearchResponse,
+)
+
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId, VectorId
+from nucliadb.search import logger
+from nucliadb.search.requesters.utils import Method, nidx_query
+from nucliadb.search.search.metrics import merge_observer, search_observer
+from nucliadb.search.search.query_parser.models import UnitRetrieval
+from nucliadb.search.search.query_parser.parsers.unit_retrieval import convert_retrieval_to_proto
+from nucliadb.search.search.rank_fusion import IndexSource, get_rank_fusion
+from nucliadb_models.retrieval import GraphScore, KeywordScore, SemanticScore
+from nucliadb_models.search import SCORE_TYPE, TextPosition
+
+# Constant score given to all graph results until we implement graph scoring
+FAKE_GRAPH_SCORE = 1.0
+
+
+async def nidx_search(kbid: str, pb_query: SearchRequest) -> tuple[SearchResponse, list[str]]:
+    """Wrapper around nidx_query for SEARCH that merges shards results in a
+    single response.
+
+    At some point, nidx will provide this functionality and we'll be able to
+    remove this.
+
+    """
+    shards_responses, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
+    response = merge_shard_responses(shards_responses)
+    return response, queried_shards
+
+
+@search_observer.wrap({"type": "text_block_search"})
+async def text_block_search(
+    kbid: str, retrieval: UnitRetrieval
+) -> tuple[list[TextBlockMatch], SearchRequest, SearchResponse, list[str]]:
+    """Search for text blocks in multiple indexes and return an rank fused view.
+
+    This search method provides a textual view of the data. For example, given a
+    graph query, it will return the text blocks associated with matched
+    triplets, not the triplet itself.
+
+    """
+    assert retrieval.rank_fusion is not None, "text block search requries a rank fusion algorithm"
+
+    pb_query = convert_retrieval_to_proto(retrieval)
+    shards_response, queried_shards = await nidx_search(kbid, pb_query)
+
+    keyword_results = keyword_results_to_text_block_matches(shards_response.paragraph.results)
+    semantic_results = semantic_results_to_text_block_matches(shards_response.vector.documents)
+    graph_results = graph_results_to_text_block_matches(shards_response.graph)
+
+    rank_fusion = get_rank_fusion(retrieval.rank_fusion)
+    merged_text_blocks = rank_fusion.fuse(
+        {
+            IndexSource.KEYWORD: keyword_results,
+            IndexSource.SEMANTIC: semantic_results,
+            IndexSource.GRAPH: graph_results,
+        }
+    )
+
+    # cut to the rank fusion window. As we ask each shard and index this window,
+    # we'll normally have extra results
+    text_blocks = merged_text_blocks[: retrieval.rank_fusion.window]
+
+    return text_blocks, pb_query, shards_response, queried_shards
+
+
+@merge_observer.wrap({"type": "shards_responses"})
+def merge_shard_responses(
+    responses: list[SearchResponse],
+) -> SearchResponse:
+    """Merge search responses into a single response as if there were no shards
+    involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    paragraphs = []
+    vectors = []
+    graphs = []
+    for response in responses:
+        paragraphs.append(response.paragraph)
+        vectors.append(response.vector)
+        graphs.append(response.graph)
+
+    merged = SearchResponse(
+        paragraph=merge_shards_keyword_responses(paragraphs),
+        vector=merge_shards_semantic_responses(vectors),
+        graph=merge_shards_graph_responses(graphs),
+    )
+    return merged
+
+
+def merge_shards_keyword_responses(
+    keyword_responses: list[ParagraphSearchResponse],
+) -> ParagraphSearchResponse:
+    """Merge keyword (paragraph) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = ParagraphSearchResponse()
+    for response in keyword_responses:
+        merged.query = response.query
+        merged.next_page = merged.next_page or response.next_page
+        merged.total += response.total
+        merged.results.extend(response.results)
+        merged.ematches.extend(response.ematches)
+
+    return merged
+
+
+def merge_shards_semantic_responses(
+    semantic_responses: list[VectorSearchResponse],
+) -> VectorSearchResponse:
+    """Merge semantic (vector) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = VectorSearchResponse()
+    for response in semantic_responses:
+        merged.documents.extend(response.documents)
+
+    return merged
+
+
+def merge_shards_graph_responses(
+    graph_responses: list[GraphSearchResponse],
+):
+    merged = GraphSearchResponse()
+
+    for response in graph_responses:
+        nodes_offset = len(merged.nodes)
+        relations_offset = len(merged.relations)
+
+        # paths contain indexes to nodes and relations, we must offset them
+        # while merging responses to maintain valid data
+        for path in response.graph:
+            merged_path = GraphSearchResponse.Path()
+            merged_path.CopyFrom(path)
+            merged_path.source += nodes_offset
+            merged_path.relation += relations_offset
+            merged_path.destination += nodes_offset
+            merged.graph.append(merged_path)
+
+        merged.nodes.extend(response.nodes)
+        merged.relations.extend(response.relations)
+
+    return merged
+
+
+def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
+    fuzzy_result = len(item.matches) > 0
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_string(item.paragraph),
+        scores=[KeywordScore(score=item.score.bm25)],
+        score_type=SCORE_TYPE.BM25,
+        order=0,  # NOTE: this will be filled later
+        text=None,  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=item.start,
+            end=item.end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=fuzzy_result,
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file or None,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
+    return [keyword_result_to_text_block_match(item) for item in items]
+
+
+class InvalidDocId(Exception):
+    """Raised while parsing an invalid id coming from semantic search"""
+
+    def __init__(self, invalid_vector_id: str):
+        self.invalid_vector_id = invalid_vector_id
+        super().__init__(f"Invalid vector ID: {invalid_vector_id}")
+
+
+def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
+    try:
+        vector_id = VectorId.from_string(item.doc_id.id)
+    except (IndexError, ValueError):
+        raise InvalidDocId(item.doc_id.id)
+
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_vector_id(vector_id),
+        scores=[SemanticScore(score=item.score)],
+        score_type=SCORE_TYPE.VECTOR,
+        order=0,  # NOTE: this will be filled later
+        text=None,  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=vector_id.vector_start,
+            end=vector_id.vector_end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=False,  # semantic search doesn't have fuzziness
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file or None,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
+    text_blocks: list[TextBlockMatch] = []
+    for item in items:
+        try:
+            text_block = semantic_result_to_text_block_match(item)
+        except InvalidDocId as exc:
+            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
+            continue
+        text_blocks.append(text_block)
+    return text_blocks
+
+
+def graph_results_to_text_block_matches(item: GraphSearchResponse) -> list[TextBlockMatch]:
+    matches = []
+    for path in item.graph:
+        metadata = path.metadata
+
+        if not metadata.paragraph_id:
+            continue
+
+        paragraph_id = ParagraphId.from_string(metadata.paragraph_id)
+        matches.append(
+            TextBlockMatch(
+                paragraph_id=paragraph_id,
+                scores=[GraphScore(score=FAKE_GRAPH_SCORE)],
+                score_type=SCORE_TYPE.RELATION_RELEVANCE,
+                order=0,  # NOTE: this will be filled later
+                text=None,  # NOTE: this will be filled later too
+                position=TextPosition(
+                    page_number=0,
+                    index=0,
+                    start=paragraph_id.paragraph_start,
+                    end=paragraph_id.paragraph_end,
+                    start_seconds=[],
+                    end_seconds=[],
+                ),
+                # XXX: we should split labels
+                field_labels=[],
+                paragraph_labels=[],
+                fuzzy_search=False,  # TODO: this depends on the query, should we populate it?
+                is_a_table=False,
+                representation_file="",
+                page_with_visual=False,
+            )
+        )
+
+    return matches
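
A detail worth calling out in merge_shards_graph_responses above: graph paths reference nodes and relations by index, so when two shard responses are concatenated the second response's indices must be shifted by the sizes already accumulated. An illustrative-only sketch of that offset trick, using plain lists and tuples instead of the GraphSearchResponse protos:

def merge_graphs(responses: list[dict]) -> dict:
    # Paths are (source, relation, destination) index triples into nodes/relations.
    merged = {"nodes": [], "relations": [], "paths": []}
    for response in responses:
        nodes_offset = len(merged["nodes"])
        relations_offset = len(merged["relations"])
        for source, relation, destination in response["paths"]:
            merged["paths"].append(
                (source + nodes_offset, relation + relations_offset, destination + nodes_offset)
            )
        merged["nodes"].extend(response["nodes"])
        merged["relations"].extend(response["relations"])
    return merged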
nucliadb/search/search/summarize.py

@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Optional

 from nucliadb.common import datamanagers
 from nucliadb.common.maindb.utils import get_driver
@@ -36,7 +35,7 @@ from nucliadb_models.search import (
 from nucliadb_protos.utils_pb2 import ExtractedText
 from nucliadb_utils.utilities import get_storage

-ExtractedTexts = list[tuple[str, str, Optional[ExtractedText]]]
+ExtractedTexts = list[tuple[str, str, ExtractedText | None]]

 MAX_GET_EXTRACTED_TEXT_OPS = 20

@@ -46,7 +45,7 @@ class NoResourcesToSummarize(Exception):


 async def summarize(
-    kbid: str, request: SummarizeRequest, extra_predict_headers: Optional[dict[str, str]]
+    kbid: str, request: SummarizeRequest, extra_predict_headers: dict[str, str] | None
 ) -> SummarizedResponse:
     predict_request = SummarizeModel()
     predict_request.generative_model = request.generative_model
@@ -87,7 +86,7 @@ async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) ->
         if uuid is None:
             logger.warning(f"Resource {uuid_or_slug} not found in KB", extra={"kbid": kbid})
             continue
-        resource_orm = Resource(txn=txn, storage=storage, kb=kb_orm, uuid=uuid)
+        resource_orm = Resource(txn=txn, storage=storage, kbid=kbid, uuid=uuid)
         fields = await resource_orm.get_fields(force=True)
         for _, field in fields.items():
             task = asyncio.create_task(get_extracted_text(uuid_or_slug, field, max_tasks))
@@ -115,14 +114,14 @@ async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) ->

 async def get_extracted_text(
     uuid_or_slug, field: Field, max_operations: asyncio.Semaphore
-) -> tuple[str, str, Optional[ExtractedText]]:
+) -> tuple[str, str, ExtractedText | None]:
     async with max_operations:
         extracted_text = await field.get_extracted_text(force=True)
         field_key = f"{field.type}/{field.id}"
         return uuid_or_slug, field_key, extracted_text


-async def get_resource_uuid(kbobj: KnowledgeBox, uuid_or_slug: str) -> Optional[str]:
+async def get_resource_uuid(kbobj: KnowledgeBox, uuid_or_slug: str) -> str | None:
     """
     Return the uuid of the resource with the given uuid_or_slug.
     """
nucliadb/search/search/utils.py

@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import logging
-from typing import Optional

 from pydantic import BaseModel

@@ -30,7 +29,7 @@ from nucliadb_utils.utilities import has_feature
 logger = logging.getLogger(__name__)


-async def filter_hidden_resources(kbid: str, show_hidden: bool) -> Optional[bool]:
+async def filter_hidden_resources(kbid: str, show_hidden: bool) -> bool | None:
     kb_config = await kb.get_config(kbid=kbid)
     hidden_enabled = kb_config and kb_config.hidden_resources_enabled
     if hidden_enabled and not show_hidden:
@@ -41,8 +40,8 @@ async def filter_hidden_resources(kbid: str, show_hidden: bool) -> Optional[bool

 def min_score_from_query_params(
     min_score_bm25: float,
-    min_score_semantic: Optional[float],
-    deprecated_min_score: Optional[float],
+    min_score_semantic: float | None,
+    deprecated_min_score: float | None,
 ) -> MinScore:
     # Keep backward compatibility with the deprecated min_score parameter
     semantic = deprecated_min_score if min_score_semantic is None else min_score_semantic
nucliadb/search/settings.py

@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-from typing import Optional

 from pydantic import Field

@@ -43,7 +42,7 @@ class Settings(DriverSettings):
         title="Prequeries max parallel",
         description="The maximum number of prequeries to run in parallel per /ask request",
     )
-    nidx_address: Optional[str] = Field(default=None)
+    nidx_address: str | None = Field(default=None)


 settings = Settings()
nucliadb/standalone/api_router.py

@@ -57,7 +57,7 @@ async def api_config_check(request: Request):
            valid_nua_key = True
        except Exception as exc:
            logger.warning(f"Error validating nua key", exc_info=exc)
-            nua_key_check_error = f"Error checking NUA key: {str(exc)}"
+            nua_key_check_error = f"Error checking NUA key: {exc!s}"
    return JSONResponse(
        {
            "nua_api_key": {
nucliadb/standalone/app.py

@@ -31,7 +31,7 @@ from starlette.responses import HTMLResponse
 from starlette.routing import Mount

 import nucliadb_admin_assets  # type: ignore
-from nucliadb.middleware import ProcessTimeHeaderMiddleware
+from nucliadb.middleware import ClientErrorPayloadLoggerMiddleware, ProcessTimeHeaderMiddleware
 from nucliadb.reader import API_PREFIX
 from nucliadb.reader.api.v1.router import api as api_reader_v1
 from nucliadb.search.api.v1.router import api as api_search_v1
@@ -79,7 +79,7 @@ HOMEPAGE_HTML = """
 </ul>
 </body>
 </html>
-"""  # noqa: E501
+"""


 def application_factory(settings: Settings) -> FastAPI:
@@ -95,13 +95,13 @@ def application_factory(settings: Settings) -> FastAPI:
             backend=get_auth_backend(settings),
         ),
         Middleware(AuditMiddleware, audit_utility_getter=get_audit),
+        Middleware(ClientErrorPayloadLoggerMiddleware),
     ]
     if running_settings.debug:
         middleware.append(Middleware(ProcessTimeHeaderMiddleware))

     fastapi_settings = dict(
         debug=running_settings.debug,
-        middleware=middleware,
         lifespan=lifespan,
         exception_handlers={
             Exception: global_exception_handler,
@@ -122,6 +122,7 @@ def application_factory(settings: Settings) -> FastAPI:
         prefix_format=f"/{API_PREFIX}/v{{major}}",
         default_version=(1, 0),
         enable_latest=False,
+        middleware=middleware,
         kwargs=fastapi_settings,
     )