nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -17,14 +17,12 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional

 from nidx_protos import nodereader_pb2

 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.common.filter_expression import parse_expression
 from nucliadb.search.search.metrics import query_parser_observer
-from nucliadb.search.search.query import expand_entities
 from nucliadb.search.search.query_parser.fetcher import Fetcher
 from nucliadb.search.search.query_parser.models import (
     Filters,
@@ -51,7 +49,6 @@ from .common import (
     parse_semantic_query,
     parse_top_k,
     should_disable_vector_search,
-    validate_query_syntax,
 )

 INDEX_SORTABLE_FIELDS = [
@@ -61,9 +58,7 @@


 @query_parser_observer.wrap({"type": "parse_search"})
-async def parse_search(
-    kbid: str, item: SearchRequest, *, fetcher: Optional[Fetcher] = None
-) -> ParsedQuery:
+async def parse_search(kbid: str, item: SearchRequest, *, fetcher: Fetcher | None = None) -> ParsedQuery:
     fetcher = fetcher or fetcher_for_search(kbid, item)
     parser = _SearchParser(kbid, item, fetcher)
     retrieval = await parser.parse()
@@ -90,14 +85,17 @@ class _SearchParser:
         self.fetcher = fetcher

         # cached data while parsing
-        self._query:
-        self._top_k:
+        self._query: Query | None = None
+        self._top_k: int | None = None

     async def parse(self) -> UnitRetrieval:
         self._validate_request()

         self._top_k = parse_top_k(self.item)

+        if self._top_k > 0 and self.item.offset > 0:
+            self._top_k += self.item.offset
+
         # parse search types (features)

         self._query = Query()
@@ -128,8 +126,6 @@
         return retrieval

     def _validate_request(self):
-        validate_query_syntax(self.item.query)
-
         # synonyms are not compatible with vector/graph search
         if (
             self.item.with_synonyms
@@ -152,67 +148,38 @@
         assert self._top_k is not None, "top_k must be parsed before text query"

         keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)
-        sort, order_by
+        sort, order_by = self._parse_sorting()
         keyword.sort = sort
         keyword.order_by = order_by
-
-        # sort limit can extend top_k
-        self._top_k = max(self._top_k, limit)
+
         return keyword

     async def _parse_relation_query(self) -> RelationQuery:
         detected_entities = await self._get_detected_entities()
-
-        meta_cache = await self.fetcher.get_entities_meta_cache()
-        deleted_entities = meta_cache.deleted_entities
+
         return RelationQuery(
-            entry_points=detected_entities,
-            deleted_entity_groups=deleted_entity_groups,
-            deleted_entities=deleted_entities,
+            entry_points=detected_entities, deleted_entity_groups=[], deleted_entities={}
         )

     async def _get_detected_entities(self) -> list[utils_pb2.RelationNode]:
         detected_entities = await self.fetcher.get_detected_entities()
-        meta_cache = await self.fetcher.get_entities_meta_cache()
-        detected_entities = expand_entities(meta_cache, detected_entities)
         return detected_entities

-    def _parse_sorting(self) -> tuple[search_models.SortOrder, search_models.SortField
+    def _parse_sorting(self) -> tuple[search_models.SortOrder, search_models.SortField]:
         sort = self.item.sort
-        if
-            if
+        if sort is None:
+            if len(self.item.query) == 0:
                 sort = SortOptions(
                     field=SortField.CREATED,
                     order=SortOrder.DESC,
-                    limit=None,
-                )
-            elif sort.field not in INDEX_SORTABLE_FIELDS:
-                raise InvalidQueryError(
-                    "sort_field",
-                    f"Empty query can only be sorted by '{SortField.CREATED}' or"
-                    f" '{SortField.MODIFIED}' and sort limit won't be applied",
                 )
-
-        if sort is None:
+            else:
                 sort = SortOptions(
                     field=SortField.SCORE,
                     order=SortOrder.DESC,
-                    limit=None,
-            )
-        elif sort.field not in INDEX_SORTABLE_FIELDS and sort.limit is None:
-            raise InvalidQueryError(
-                "sort_field",
-                f"Sort by '{sort.field}' requires setting a sort limit",
                 )

-
-        top_k = None
-        if sort and sort.limit is not None:
-            # As the index can't sort, we have to do it when merging. To
-            # have consistent results, we must limit them
-            top_k = sort.limit
-
-        return (sort.order, sort.field, top_k)
+        return (sort.order, sort.field)

     async def _parse_filters(self) -> Filters:
         assert self._query is not None, "query must be parsed before filters"
@@ -254,17 +221,9 @@
         else:
             filter_operator = nodereader_pb2.FilterOperator.AND

-        autofilter = None
-        if self.item.autofilter:
-            if self._query.relation is not None:
-                autofilter = self._query.relation.entry_points
-            else:
-                autofilter = await self._get_detected_entities()
-
         hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)

         return Filters(
-            autofilter=autofilter,
             facets=self.item.faceted,
             field_expression=field_expr,
             paragraph_expression=paragraph_expr,
@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional

 from nidx_protos import nodereader_pb2
 from nidx_protos.nodereader_pb2 import SearchRequest
@@ -25,34 +24,14 @@ from nidx_protos.nodereader_pb2 import SearchRequest
 from nucliadb.common.filter_expression import add_and_expression
 from nucliadb.search.search.filters import translate_label
 from nucliadb.search.search.metrics import node_features, query_parser_observer
-from nucliadb.search.search.query import
+from nucliadb.search.search.query import get_sort_field_proto
 from nucliadb.search.search.query_parser.models import ParsedQuery, PredictReranker, UnitRetrieval
 from nucliadb.search.search.query_parser.parsers.graph import parse_path_query
-from nucliadb_models.labels import LABEL_HIDDEN
+from nucliadb_models.labels import LABEL_HIDDEN
 from nucliadb_models.search import SortOrderMap
 from nucliadb_protos import utils_pb2


-@query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
-async def legacy_convert_retrieval_to_proto(
-    parsed: ParsedQuery,
-) -> tuple[SearchRequest, bool, list[str], Optional[str]]:
-    converter = _Converter(parsed.retrieval)
-    request = converter.into_search_request()
-
-    # XXX: legacy values that were returned by QueryParser but not always
-    # needed. We should find a better abstraction
-
-    incomplete = is_incomplete(parsed.retrieval)
-    autofilter = converter._autofilter
-
-    rephrased_query = None
-    if parsed.retrieval.query.semantic:
-        rephrased_query = await parsed.fetcher.get_rephrased_query()
-
-    return request, incomplete, autofilter, rephrased_query
-
-
 @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
 def convert_retrieval_to_proto(retrieval: UnitRetrieval) -> SearchRequest:
     converter = _Converter(retrieval)
@@ -65,8 +44,6 @@ class _Converter:
         self.req = nodereader_pb2.SearchRequest()
         self.retrieval = retrieval

-        self._autofilter: list[str] = []
-
     def into_search_request(self) -> nodereader_pb2.SearchRequest:
         """Generate a SearchRequest proto from a retrieval operation."""
         self._apply_text_queries()
@@ -75,6 +52,7 @@
         self._apply_graph_query()
         self._apply_filters()
         self._apply_top_k()
+
         return self.req

     def _apply_text_queries(self) -> None:
@@ -235,10 +213,6 @@
             self.req.paragraph_filter.CopyFrom(self.retrieval.filters.paragraph_expression)
             self.req.filter_operator = self.retrieval.filters.filter_expression_operator

-        if self.retrieval.filters.autofilter:
-            entity_filters = apply_entities_filter(self.req, self.retrieval.filters.autofilter)
-            self._autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
-
         if self.retrieval.filters.hidden is not None:
             expr = nodereader_pb2.FilterExpression()
             if self.retrieval.filters.hidden:
@@ -281,3 +255,8 @@ def is_incomplete(retrieval: UnitRetrieval) -> bool:
         return False
     incomplete = retrieval.query.semantic.query is None or len(retrieval.query.semantic.query) == 0
     return incomplete
+
+
+def get_rephrased_query(parsed: ParsedQuery) -> str | None:
+    """Given a parsed query, return the rephrased query used, if any."""
+    return parsed.fetcher.get_cached_rephrased_query()
@@ -20,11 +20,12 @@
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import
+from typing import TypeVar

 from nucliadb.common.external_index_providers.base import ScoredTextBlock
 from nucliadb.common.ids import ParagraphId
 from nucliadb.search.search.query_parser import models as parser_models
+from nucliadb_models.retrieval import RrfScore, Score, WeightedCombSumScore
 from nucliadb_models.search import SCORE_TYPE
 from nucliadb_telemetry.metrics import Observer

@@ -127,7 +128,7 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         k: float = 60.0,
         *,
         window: int,
-        weights:
+        weights: dict[str, float] | None = None,
         default_weight: float = 1.0,
     ):
         super().__init__(window)
@@ -145,7 +146,7 @@
         sources: dict[str, list[ScoredItem]],
     ) -> list[ScoredItem]:
         # accumulated scores per paragraph
-        scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        scores: dict[ParagraphId, tuple[float, SCORE_TYPE, list[Score]]] = {}
         # pointers from paragraph to the original source
         match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}

@@ -161,11 +162,12 @@
         for i, (ranking, weight) in enumerate(rankings):
             for rank, item in enumerate(ranking):
                 id = item.paragraph_id
-                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score, score_type, history = scores.setdefault(id, (0, item.score_type, []))
                 score += 1 / (self._k + rank) * weight
+                history.append(item.current_score)
                 if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                     score_type = SCORE_TYPE.BOTH
-                scores[id] = (score, score_type)
+                scores[id] = (score, score_type, history)

                 position = (i, rank)
                 match_positions.setdefault(item.paragraph_id, []).append(position)
@@ -175,9 +177,10 @@
             # we are getting only one position, effectively deduplicating
             # multiple matches for the same text block
             i, j = match_positions[paragraph_id][0]
-            score, score_type = scores[paragraph_id]
+            score, score_type, history = scores[paragraph_id]
             item = rankings[i][0][j]
-
+            history.append(RrfScore(score=score))
+            item.scores = history
             item.score_type = score_type
             merged.append(item)

@@ -207,7 +210,7 @@ class WeightedCombSum(RankFusionAlgorithm):
         self,
         *,
         window: int,
-        weights:
+        weights: dict[str, float] | None = None,
         default_weight: float = 1.0,
     ):
         super().__init__(window)
@@ -217,7 +220,7 @@
     @rank_fusion_observer.wrap({"type": "weighted_comb_sum"})
     def _fuse(self, sources: dict[str, list[ScoredItem]]) -> list[ScoredItem]:
         # accumulated scores per paragraph
-        scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        scores: dict[ParagraphId, tuple[float, SCORE_TYPE, list[Score]]] = {}
         # pointers from paragraph to the original source
         match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}

@@ -228,11 +231,12 @@
         for i, (ranking, weight) in enumerate(rankings):
             for j, item in enumerate(ranking):
                 id = item.paragraph_id
-                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score, score_type, history = scores.setdefault(id, (0, item.score_type, []))
                 score += item.score * weight
+                history.append(item.current_score)
                 if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                     score_type = SCORE_TYPE.BOTH
-                scores[id] = (score, score_type)
+                scores[id] = (score, score_type, history)

                 position = (i, j)
                 match_positions.setdefault(item.paragraph_id, []).append(position)
@@ -242,9 +246,10 @@
             # we are getting only one position, effectively deduplicating
             # multiple matches for the same text block
             i, j = match_positions[paragraph_id][0]
-            score, score_type = scores[paragraph_id]
+            score, score_type, history = scores[paragraph_id]
             item = rankings[i][0][j]
-
+            history.append(WeightedCombSumScore(score=score))
+            item.scores = history
             item.score_type = score_type
             merged.append(item)

@@ -21,7 +21,8 @@
 import logging
 from abc import ABC, abstractmethod, abstractproperty
 from dataclasses import dataclass
-
+
+from typing_extensions import assert_never

 from nucliadb.search.predict import ProxiedPredictAPIError, SendToPredictError
 from nucliadb.search.search.query_parser import models as parser_models
@@ -63,7 +64,7 @@ class RerankingOptions:

 class Reranker(ABC):
     @abstractproperty
-    def window(self) ->
+    def window(self) -> int | None:
         """Number of elements the reranker requests. `None` means no specific
         window is enforced."""
         ...
@@ -102,7 +103,7 @@ class NoopReranker(Reranker):
     """

     @property
-    def window(self) ->
+    def window(self) -> int | None:
         return None

     @reranker_observer.wrap({"type": "noop"})
@@ -181,10 +182,8 @@ def get_reranker(reranker: parser_models.Reranker) -> Reranker:
     elif isinstance(reranker, parser_models.PredictReranker):
         algorithm = PredictReranker(reranker.window)

-    else: # pragma:
-
-        # that is, if we are missing some ifs
-        _a: int = "a"
+    else: # pragma: no cover
+        assert_never(reranker)

     return algorithm

@@ -0,0 +1,300 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from collections.abc import Iterable
+
+from nidx_protos.nodereader_pb2 import (
+    DocumentScored,
+    GraphSearchResponse,
+    ParagraphResult,
+    ParagraphSearchResponse,
+    SearchRequest,
+    SearchResponse,
+    VectorSearchResponse,
+)
+
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId, VectorId
+from nucliadb.search import logger
+from nucliadb.search.requesters.utils import Method, nidx_query
+from nucliadb.search.search.metrics import merge_observer, search_observer
+from nucliadb.search.search.query_parser.models import UnitRetrieval
+from nucliadb.search.search.query_parser.parsers.unit_retrieval import convert_retrieval_to_proto
+from nucliadb.search.search.rank_fusion import IndexSource, get_rank_fusion
+from nucliadb_models.retrieval import GraphScore, KeywordScore, SemanticScore
+from nucliadb_models.search import SCORE_TYPE, TextPosition
+
+# Constant score given to all graph results until we implement graph scoring
+FAKE_GRAPH_SCORE = 1.0
+
+
+async def nidx_search(kbid: str, pb_query: SearchRequest) -> tuple[SearchResponse, list[str]]:
+    """Wrapper around nidx_query for SEARCH that merges shards results in a
+    single response.
+
+    At some point, nidx will provide this functionality and we'll be able to
+    remove this.
+
+    """
+    shards_responses, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
+    response = merge_shard_responses(shards_responses)
+    return response, queried_shards
+
+
+@search_observer.wrap({"type": "text_block_search"})
+async def text_block_search(
+    kbid: str, retrieval: UnitRetrieval
+) -> tuple[list[TextBlockMatch], SearchRequest, SearchResponse, list[str]]:
+    """Search for text blocks in multiple indexes and return an rank fused view.
+
+    This search method provides a textual view of the data. For example, given a
+    graph query, it will return the text blocks associated with matched
+    triplets, not the triplet itself.
+
+    """
+    assert retrieval.rank_fusion is not None, "text block search requries a rank fusion algorithm"
+
+    pb_query = convert_retrieval_to_proto(retrieval)
+    shards_response, queried_shards = await nidx_search(kbid, pb_query)
+
+    keyword_results = keyword_results_to_text_block_matches(shards_response.paragraph.results)
+    semantic_results = semantic_results_to_text_block_matches(shards_response.vector.documents)
+    graph_results = graph_results_to_text_block_matches(shards_response.graph)
+
+    rank_fusion = get_rank_fusion(retrieval.rank_fusion)
+    merged_text_blocks = rank_fusion.fuse(
+        {
+            IndexSource.KEYWORD: keyword_results,
+            IndexSource.SEMANTIC: semantic_results,
+            IndexSource.GRAPH: graph_results,
+        }
+    )
+
+    # cut to the rank fusion window. As we ask each shard and index this window,
+    # we'll normally have extra results
+    text_blocks = merged_text_blocks[: retrieval.rank_fusion.window]
+
+    return text_blocks, pb_query, shards_response, queried_shards
+
+
+@merge_observer.wrap({"type": "shards_responses"})
+def merge_shard_responses(
+    responses: list[SearchResponse],
+) -> SearchResponse:
+    """Merge search responses into a single response as if there were no shards
+    involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    paragraphs = []
+    vectors = []
+    graphs = []
+    for response in responses:
+        paragraphs.append(response.paragraph)
+        vectors.append(response.vector)
+        graphs.append(response.graph)
+
+    merged = SearchResponse(
+        paragraph=merge_shards_keyword_responses(paragraphs),
+        vector=merge_shards_semantic_responses(vectors),
+        graph=merge_shards_graph_responses(graphs),
+    )
+    return merged
+
+
+def merge_shards_keyword_responses(
+    keyword_responses: list[ParagraphSearchResponse],
+) -> ParagraphSearchResponse:
+    """Merge keyword (paragraph) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = ParagraphSearchResponse()
+    for response in keyword_responses:
+        merged.query = response.query
+        merged.next_page = merged.next_page or response.next_page
+        merged.total += response.total
+        merged.results.extend(response.results)
+        merged.ematches.extend(response.ematches)
+
+    return merged
+
+
+def merge_shards_semantic_responses(
+    semantic_responses: list[VectorSearchResponse],
+) -> VectorSearchResponse:
+    """Merge semantic (vector) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = VectorSearchResponse()
+    for response in semantic_responses:
+        merged.documents.extend(response.documents)
+
+    return merged
+
+
+def merge_shards_graph_responses(
+    graph_responses: list[GraphSearchResponse],
+):
+    merged = GraphSearchResponse()
+
+    for response in graph_responses:
+        nodes_offset = len(merged.nodes)
+        relations_offset = len(merged.relations)
+
+        # paths contain indexes to nodes and relations, we must offset them
+        # while merging responses to maintain valid data
+        for path in response.graph:
+            merged_path = GraphSearchResponse.Path()
+            merged_path.CopyFrom(path)
+            merged_path.source += nodes_offset
+            merged_path.relation += relations_offset
+            merged_path.destination += nodes_offset
+            merged.graph.append(merged_path)
+
+        merged.nodes.extend(response.nodes)
+        merged.relations.extend(response.relations)
+
+    return merged
+
+
+def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
+    fuzzy_result = len(item.matches) > 0
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_string(item.paragraph),
+        scores=[KeywordScore(score=item.score.bm25)],
+        score_type=SCORE_TYPE.BM25,
+        order=0,  # NOTE: this will be filled later
+        text=None,  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=item.start,
+            end=item.end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=fuzzy_result,
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file or None,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
+    return [keyword_result_to_text_block_match(item) for item in items]
+
+
+class InvalidDocId(Exception):
+    """Raised while parsing an invalid id coming from semantic search"""
+
+    def __init__(self, invalid_vector_id: str):
+        self.invalid_vector_id = invalid_vector_id
+        super().__init__(f"Invalid vector ID: {invalid_vector_id}")
+
+
+def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
+    try:
+        vector_id = VectorId.from_string(item.doc_id.id)
+    except (IndexError, ValueError):
+        raise InvalidDocId(item.doc_id.id)
+
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_vector_id(vector_id),
+        scores=[SemanticScore(score=item.score)],
+        score_type=SCORE_TYPE.VECTOR,
+        order=0,  # NOTE: this will be filled later
+        text=None,  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=vector_id.vector_start,
+            end=vector_id.vector_end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=False,  # semantic search doesn't have fuzziness
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file or None,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
+    text_blocks: list[TextBlockMatch] = []
+    for item in items:
+        try:
+            text_block = semantic_result_to_text_block_match(item)
+        except InvalidDocId as exc:
+            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
+            continue
+        text_blocks.append(text_block)
+    return text_blocks
+
+
+def graph_results_to_text_block_matches(item: GraphSearchResponse) -> list[TextBlockMatch]:
+    matches = []
+    for path in item.graph:
+        metadata = path.metadata
+
+        if not metadata.paragraph_id:
+            continue
+
+        paragraph_id = ParagraphId.from_string(metadata.paragraph_id)
+        matches.append(
+            TextBlockMatch(
+                paragraph_id=paragraph_id,
+                scores=[GraphScore(score=FAKE_GRAPH_SCORE)],
+                score_type=SCORE_TYPE.RELATION_RELEVANCE,
+                order=0,  # NOTE: this will be filled later
+                text=None,  # NOTE: this will be filled later too
+                position=TextPosition(
+                    page_number=0,
+                    index=0,
+                    start=paragraph_id.paragraph_start,
+                    end=paragraph_id.paragraph_end,
+                    start_seconds=[],
+                    end_seconds=[],
+                ),
+                # XXX: we should split labels
+                field_labels=[],
+                paragraph_labels=[],
+                fuzzy_search=False,  # TODO: this depends on the query, should we populate it?
+                is_a_table=False,
+                representation_file="",
+                page_with_visual=False,
+            )
+        )
+
+    return matches