nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/find_merge.py
@@ -18,104 +18,56 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  import asyncio
- from typing import Iterable, Optional, Union
-
- from nidx_protos.nodereader_pb2 import (
- DocumentScored,
- GraphSearchResponse,
- ParagraphResult,
- ParagraphSearchResponse,
- SearchResponse,
- VectorSearchResponse,
- )
+ from collections.abc import Iterable
+
+ from nidx_protos.nodereader_pb2 import GraphSearchResponse, SearchResponse

  from nucliadb.common.external_index_providers.base import TextBlockMatch
- from nucliadb.common.ids import ParagraphId, VectorId
- from nucliadb.search import SERVICE_NAME, logger
+ from nucliadb.common.ids import ParagraphId
+ from nucliadb.models.internal.augment import AugmentedParagraph, Paragraph, ParagraphText
+ from nucliadb.search.augmentor.paragraphs import augment_paragraphs
+ from nucliadb.search.augmentor.resources import augment_resources_deep
  from nucliadb.search.search.cut import cut_page
  from nucliadb.search.search.hydrator import (
  ResourceHydrationOptions,
  TextBlockHydrationOptions,
- hydrate_resource_metadata,
- hydrate_text_block,
- text_block_to_find_paragraph,
  )
  from nucliadb.search.search.merge import merge_relations_results
+ from nucliadb.search.search.metrics import merge_observer
+ from nucliadb.search.search.paragraphs import highlight_paragraph
  from nucliadb.search.search.query_parser.models import UnitRetrieval
- from nucliadb.search.search.rank_fusion import IndexSource, RankFusionAlgorithm
- from nucliadb.search.search.rerankers import (
- RerankableItem,
- Reranker,
- RerankingOptions,
- )
- from nucliadb_models.common import FieldTypeName
- from nucliadb_models.resource import ExtractedDataTypeName, Resource
+ from nucliadb.search.search.rerankers import RerankableItem, Reranker, RerankingOptions
+ from nucliadb_models.resource import Resource
+ from nucliadb_models.retrieval import RerankerScore
  from nucliadb_models.search import (
- SCORE_TYPE,
  FindField,
+ FindParagraph,
  FindResource,
  KnowledgeboxFindResults,
  MinScore,
- ResourceProperties,
- TextPosition,
  )
  from nucliadb_telemetry import metrics

- from .metrics import merge_observer
-
  FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
  "nucliadb_find_fetch_operations",
  buckets=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 200],
  )

- # Constant score given to all graph results until we implement graph scoring
- FAKE_GRAPH_SCORE = 1.0
-

  @merge_observer.wrap({"type": "find_merge"})
  async def build_find_response(
- search_responses: list[SearchResponse],
+ search_response: SearchResponse,
+ merged_text_blocks: list[TextBlockMatch],
+ graph_response: GraphSearchResponse,
  *,
  retrieval: UnitRetrieval,
  kbid: str,
  query: str,
- rephrased_query: Optional[str],
- rank_fusion_algorithm: RankFusionAlgorithm,
+ rephrased_query: str | None,
  reranker: Reranker,
- show: list[ResourceProperties] = [],
- extracted: list[ExtractedDataTypeName] = [],
- field_type_filter: list[FieldTypeName] = [],
- highlight: bool = False,
+ resource_hydration_options: ResourceHydrationOptions,
+ text_block_hydration_options: TextBlockHydrationOptions,
  ) -> KnowledgeboxFindResults:
- # XXX: we shouldn't need a min score that we haven't used. Previous
- # implementations got this value from the proto request (i.e., default to 0)
- min_score_bm25 = 0.0
- if retrieval.query.keyword is not None:
- min_score_bm25 = retrieval.query.keyword.min_score
- min_score_semantic = 0.0
- if retrieval.query.semantic is not None:
- min_score_semantic = retrieval.query.semantic.min_score
-
- # merge
- search_response = merge_shard_responses(search_responses)
-
- keyword_results = keyword_results_to_text_block_matches(search_response.paragraph.results)
- semantic_results = semantic_results_to_text_block_matches(
- filter(
- lambda x: x.score >= min_score_semantic,
- search_response.vector.documents,
- )
- )
- graph_results = graph_results_to_text_block_matches(search_response.graph)
-
- merged_text_blocks = rank_fusion_algorithm.fuse(
- {
- IndexSource.KEYWORD: keyword_results,
- IndexSource.SEMANTIC: semantic_results,
- IndexSource.GRAPH: graph_results,
- }
- )
-
  # cut
  # we assume pagination + predict reranker is forbidden and has been already
  # enforced/validated by the query parsing.
@@ -126,14 +78,12 @@ async def build_find_response(
  text_blocks_page, next_page = cut_page(merged_text_blocks, retrieval.top_k)

  # hydrate and rerank
- resource_hydration_options = ResourceHydrationOptions(
- show=show, extracted=extracted, field_type_filter=field_type_filter
+ reranking_options = RerankingOptions(
+ kbid=kbid,
+ # if we have a rephrased query, we assume it'll be better for the
+ # reranker model. Otherwise, use the user query
+ query=rephrased_query or query,
  )
- text_block_hydration_options = TextBlockHydrationOptions(
- highlight=highlight,
- ematches=search_response.paragraph.ematches, # type: ignore
- )
- reranking_options = RerankingOptions(kbid=kbid, query=query)
  text_blocks, resources, best_matches = await hydrate_and_rerank(
  text_blocks_page,
  kbid,
@@ -148,12 +98,41 @@ async def build_find_response(
  entry_points = []
  if retrieval.query.relation is not None:
  entry_points = retrieval.query.relation.entry_points
- relations = await merge_relations_results([search_response.graph], entry_points)
+ relations = await merge_relations_results([graph_response], entry_points)

  # compose response
  find_resources = compose_find_resources(text_blocks, resources)

- next_page = search_response.paragraph.next_page or next_page
+ # Compute some misc values for the response
+
+ # XXX: we shouldn't need a min score that we haven't used. Previous
+ # implementations got this value from the proto request (i.e., default to 0)
+ min_score_bm25 = 0.0
+ if retrieval.query.keyword is not None:
+ min_score_bm25 = retrieval.query.keyword.min_score
+ min_score_semantic = 0.0
+ if retrieval.query.semantic is not None:
+ min_score_semantic = retrieval.query.semantic.min_score
+
+ # Bw/c with pagination, next page can be obtained from different places. The
+ # meaning is whether a greater top_k would have returned more results.
+ # Although it doesn't take into account matches on the same paragraphs, an
+ # estimate is good enough
+ next_page = (
+ # when rank fusion window is greater than top_k or the reranker window
+ next_page
+ # when the keyword index already has more results
+ or search_response.paragraph.next_page
+ # when rank fusion window is greater than top_k
+ or len(merged_text_blocks) > retrieval.top_k
+ # when the sum of all indexes makes more than top_k
+ or (
+ len(search_response.paragraph.results)
+ + len(search_response.vector.documents)
+ + len([True for path in graph_response.graph if path.metadata.paragraph_id])
+ > retrieval.top_k
+ )
+ )
  total_paragraphs = search_response.paragraph.total

  find_results = KnowledgeboxFindResults(
@@ -171,212 +150,6 @@ async def build_find_response(
  return find_results


- def merge_shard_responses(
- responses: list[SearchResponse],
- ) -> SearchResponse:
- """Merge search responses into a single response as if there were no shards
- involved.
-
- ATENTION! This is not a complete merge, we are only merging the fields
- needed to compose a /find response.
-
- """
- paragraphs = []
- vectors = []
- graphs = []
- for response in responses:
- paragraphs.append(response.paragraph)
- vectors.append(response.vector)
- graphs.append(response.graph)
-
- merged = SearchResponse(
- paragraph=merge_shards_keyword_responses(paragraphs),
- vector=merge_shards_semantic_responses(vectors),
- graph=merge_shards_graph_responses(graphs),
- )
- return merged
-
-
- def merge_shards_keyword_responses(
- keyword_responses: list[ParagraphSearchResponse],
- ) -> ParagraphSearchResponse:
- """Merge keyword (paragraph) search responses into a single response as if
- there were no shards involved.
-
- ATENTION! This is not a complete merge, we are only merging the fields
- needed to compose a /find response.
-
- """
- merged = ParagraphSearchResponse()
- for response in keyword_responses:
- merged.query = response.query
- merged.next_page = merged.next_page or response.next_page
- merged.total += response.total
- merged.results.extend(response.results)
- merged.ematches.extend(response.ematches)
-
- return merged
-
-
- def merge_shards_semantic_responses(
- semantic_responses: list[VectorSearchResponse],
- ) -> VectorSearchResponse:
- """Merge semantic (vector) search responses into a single response as if
- there were no shards involved.
-
- ATENTION! This is not a complete merge, we are only merging the fields
- needed to compose a /find response.
-
- """
- merged = VectorSearchResponse()
- for response in semantic_responses:
- merged.documents.extend(response.documents)
-
- return merged
-
-
- def merge_shards_graph_responses(
- graph_responses: list[GraphSearchResponse],
- ):
- merged = GraphSearchResponse()
-
- for response in graph_responses:
- nodes_offset = len(merged.nodes)
- relations_offset = len(merged.relations)
-
- # paths contain indexes to nodes and relations, we must offset them
- # while merging responses to maintain valid data
- for path in response.graph:
- merged_path = GraphSearchResponse.Path()
- merged_path.CopyFrom(path)
- merged_path.source += nodes_offset
- merged_path.relation += relations_offset
- merged_path.destination += nodes_offset
- merged.graph.append(merged_path)
-
- merged.nodes.extend(response.nodes)
- merged.relations.extend(response.relations)
-
- return merged
-
-
- def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
- fuzzy_result = len(item.matches) > 0
- return TextBlockMatch(
- paragraph_id=ParagraphId.from_string(item.paragraph),
- score=item.score.bm25,
- score_type=SCORE_TYPE.BM25,
- order=0, # NOTE: this will be filled later
- text="", # NOTE: this will be filled later too
- position=TextPosition(
- page_number=item.metadata.position.page_number,
- index=item.metadata.position.index,
- start=item.start,
- end=item.end,
- start_seconds=[x for x in item.metadata.position.start_seconds],
- end_seconds=[x for x in item.metadata.position.end_seconds],
- ),
- # XXX: we should split labels
- field_labels=[],
- paragraph_labels=list(item.labels),
- fuzzy_search=fuzzy_result,
- is_a_table=item.metadata.representation.is_a_table,
- representation_file=item.metadata.representation.file,
- page_with_visual=item.metadata.page_with_visual,
- )
-
-
- def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
- return [keyword_result_to_text_block_match(item) for item in items]
-
-
- class InvalidDocId(Exception):
- """Raised while parsing an invalid id coming from semantic search"""
-
- def __init__(self, invalid_vector_id: str):
- self.invalid_vector_id = invalid_vector_id
- super().__init__(f"Invalid vector ID: {invalid_vector_id}")
-
-
- def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
- try:
- vector_id = VectorId.from_string(item.doc_id.id)
- except (IndexError, ValueError):
- raise InvalidDocId(item.doc_id.id)
-
- return TextBlockMatch(
- paragraph_id=ParagraphId.from_vector_id(vector_id),
- score=item.score,
- score_type=SCORE_TYPE.VECTOR,
- order=0, # NOTE: this will be filled later
- text="", # NOTE: this will be filled later too
- position=TextPosition(
- page_number=item.metadata.position.page_number,
- index=item.metadata.position.index,
- start=vector_id.vector_start,
- end=vector_id.vector_end,
- start_seconds=[x for x in item.metadata.position.start_seconds],
- end_seconds=[x for x in item.metadata.position.end_seconds],
- ),
- # XXX: we should split labels
- field_labels=[],
- paragraph_labels=list(item.labels),
- fuzzy_search=False, # semantic search doesn't have fuzziness
- is_a_table=item.metadata.representation.is_a_table,
- representation_file=item.metadata.representation.file,
- page_with_visual=item.metadata.page_with_visual,
- )
-
-
- def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
- text_blocks: list[TextBlockMatch] = []
- for item in items:
- try:
- text_block = semantic_result_to_text_block_match(item)
- except InvalidDocId as exc:
- logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
- continue
- text_blocks.append(text_block)
- return text_blocks
-
-
- def graph_results_to_text_block_matches(item: GraphSearchResponse) -> list[TextBlockMatch]:
- matches = []
- for path in item.graph:
- metadata = path.metadata
-
- if not metadata.paragraph_id:
- continue
-
- paragraph_id = ParagraphId.from_string(metadata.paragraph_id)
- matches.append(
- TextBlockMatch(
- paragraph_id=paragraph_id,
- score=FAKE_GRAPH_SCORE,
- score_type=SCORE_TYPE.RELATION_RELEVANCE,
- order=0, # NOTE: this will be filled later
- text="", # NOTE: this will be filled later too
- position=TextPosition(
- page_number=0,
- index=0,
- start=paragraph_id.paragraph_start,
- end=paragraph_id.paragraph_end,
- start_seconds=[],
- end_seconds=[],
- ),
- # XXX: we should split labels
- field_labels=[],
- paragraph_labels=[],
- fuzzy_search=False, # TODO: this depends on the query, should we populate it?
- is_a_table=False,
- representation_file="",
- page_with_visual=False,
- )
- )
-
- return matches
-
-
  @merge_observer.wrap({"type": "hydrate_and_rerank"})
  async def hydrate_and_rerank(
  text_blocks: Iterable[TextBlockMatch],
@@ -398,11 +171,12 @@ async def hydrate_and_rerank(
  """
  max_operations = asyncio.Semaphore(50)

- # Iterate text blocks and create text block and resource metadata hydration
- # tasks depending on the reranker
+ # Iterate text blocks to create an "index" for faster access by id and get a
+ # list of text block ids and resource ids to hydrate
  text_blocks_by_id: dict[str, TextBlockMatch] = {} # useful for faster access to text blocks later
- resource_hydration_ops = {}
- text_block_hydration_ops = []
+ resources_to_hydrate = set()
+ text_block_id_to_hydrate = set()
+
  for text_block in text_blocks:
  rid = text_block.paragraph_id.rid
  paragraph_id = text_block.paragraph_id.full()
@@ -417,41 +191,48 @@ async def hydrate_and_rerank(
  # ones we see now, so we'll skip this step and recompute the resources
  # later
  if not reranker.needs_extra_results:
- if rid not in resource_hydration_ops:
- resource_hydration_ops[rid] = asyncio.create_task(
- hydrate_resource_metadata(
- kbid,
- rid,
- options=resource_hydration_options,
- concurrency_control=max_operations,
- service_name=SERVICE_NAME,
- )
- )
+ resources_to_hydrate.add(rid)

- text_block_hydration_ops.append(
- asyncio.create_task(
- hydrate_text_block(
- kbid,
- text_block,
- text_block_hydration_options,
- concurrency_control=max_operations,
- )
- )
- )
+ if text_block_hydration_options.only_hydrate_empty and text_block.text:
+ pass
+ else:
+ text_block_id_to_hydrate.add(paragraph_id)

  # hydrate only the strictly needed before rerank
- hydrated_text_blocks: list[TextBlockMatch]
- hydrated_resources: list[Union[Resource, None]]
-
  ops = [
- *text_block_hydration_ops,
- *resource_hydration_ops.values(),
+ augment_paragraphs(
+ kbid,
+ given=[
+ Paragraph.from_text_block_match(text_blocks_by_id[paragraph_id])
+ for paragraph_id in text_block_id_to_hydrate
+ ],
+ select=[ParagraphText()],
+ concurrency_control=max_operations,
+ ),
+ augment_resources_deep(
+ kbid,
+ given=list(resources_to_hydrate),
+ opts=resource_hydration_options,
+ concurrency_control=max_operations,
+ ),
  ]
- FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+ FIND_FETCH_OPS_DISTRIBUTION.observe(len(text_block_id_to_hydrate) + len(resources_to_hydrate))
  results = await asyncio.gather(*ops)

- hydrated_text_blocks = results[: len(text_block_hydration_ops)] # type: ignore
- hydrated_resources = results[len(text_block_hydration_ops) :] # type: ignore
+ augmented_paragraphs: dict[ParagraphId, AugmentedParagraph | None] = results[0] # type: ignore
+ augmented_resources: dict[str, Resource | None] = results[1] # type: ignore
+
+ # add hydrated text to our text blocks
+ for text_block in text_blocks:
+ augmented = augmented_paragraphs.get(text_block.paragraph_id, None)
+ if augmented is not None and augmented.text is not None:
+ if text_block_hydration_options.highlight:
+ text = highlight_paragraph(
+ augmented.text, words=[], ematches=text_block_hydration_options.ematches
+ )
+ else:
+ text = augmented.text
+ text_block.text = text

  # with the hydrated text, rerank and apply new scores to the text blocks
  to_rerank = [
@@ -461,7 +242,7 @@ async def hydrate_and_rerank(
  score_type=text_block.score_type,
  content=text_block.text or "", # TODO: add a warning, this shouldn't usually happen
  )
- for text_block in hydrated_text_blocks
+ for text_block in text_blocks
  ]
  reranked = await reranker.rerank(to_rerank, reranking_options)

@@ -476,7 +257,7 @@ async def hydrate_and_rerank(
  score_type = item.score_type

  text_block = text_blocks_by_id[paragraph_id]
- text_block.score = score
+ text_block.scores.append(RerankerScore(score=score))
  text_block.score_type = score_type

  matches.append((paragraph_id, score))
@@ -485,7 +266,7 @@ async def hydrate_and_rerank(

  best_matches = []
  best_text_blocks = []
- resource_hydration_ops = {}
+ resources_to_hydrate.clear()
  for order, (paragraph_id, _) in enumerate(matches):
  text_block = text_blocks_by_id[paragraph_id]
  text_block.order = order
@@ -495,24 +276,19 @@ async def hydrate_and_rerank(
  # now we have removed the text block surplus, fetch resource metadata
  if reranker.needs_extra_results:
  rid = ParagraphId.from_string(paragraph_id).rid
- if rid not in resource_hydration_ops:
- resource_hydration_ops[rid] = asyncio.create_task(
- hydrate_resource_metadata(
- kbid,
- rid,
- options=resource_hydration_options,
- concurrency_control=max_operations,
- service_name=SERVICE_NAME,
- )
- )
+ resources_to_hydrate.add(rid)

  # Finally, fetch resource metadata if we haven't already done it
  if reranker.needs_extra_results:
- ops = list(resource_hydration_ops.values())
- FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
- hydrated_resources = await asyncio.gather(*ops) # type: ignore
+ FIND_FETCH_OPS_DISTRIBUTION.observe(len(resources_to_hydrate))
+ augmented_resources = await augment_resources_deep(
+ kbid,
+ given=list(resources_to_hydrate),
+ opts=resource_hydration_options,
+ concurrency_control=max_operations,
+ )

- resources = [resource for resource in hydrated_resources if resource is not None]
+ resources = [resource for resource in augmented_resources.values() if resource is not None]

  return best_text_blocks, resources, best_matches

@@ -547,5 +323,22 @@ def compose_find_resources(
  return find_resources


+ def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
+ return FindParagraph(
+ id=text_block.paragraph_id.full(),
+ text=text_block.text or "",
+ score=text_block.score,
+ score_type=text_block.score_type,
+ order=text_block.order,
+ labels=text_block.paragraph_labels,
+ fuzzy_result=text_block.fuzzy_search,
+ is_a_table=text_block.is_a_table,
+ reference=text_block.representation_file,
+ page_with_visual=text_block.page_with_visual,
+ position=text_block.position,
+ relevant_relations=text_block.relevant_relations,
+ )
+
+
  def _round(x: float) -> float:
  return round(x, ndigits=3)
nucliadb/search/search/graph_strategy.py
@@ -19,8 +19,9 @@
  import heapq
  import json
  from collections import defaultdict
+ from collections.abc import Collection, Iterable
  from dataclasses import dataclass
- from typing import Any, Collection, Iterable, Optional, Union
+ from typing import Any

  from nidx_protos import nodereader_pb2
  from nuclia_models.predict.generative_responses import (
@@ -55,6 +56,7 @@ from nucliadb_models.internal.predict import (
  RerankModel,
  )
  from nucliadb_models.resource import ExtractedDataTypeName
+ from nucliadb_models.retrieval import GraphScore
  from nucliadb_models.search import (
  SCORE_TYPE,
  AskRequest,
@@ -112,11 +114,11 @@ SCHEMA = {
  }

  PROMPT = """\
- You are an advanced language model assisting in scoring relationships (edges) between two entities in a knowledge graph, given a users question.
+ You are an advanced language model assisting in scoring relationships (edges) between two entities in a knowledge graph, given a user's question.

  For each provided **(head_entity, relationship, tail_entity)**, you must:
  1. Assign a **relevance score** between **0** and **10**.
- 2. **0** means “this relationship cant be relevant at all to the question.”
+ 2. **0** means “this relationship can't be relevant at all to the question.”
  3. **10** means “this relationship is extremely relevant to the question.”
  4. You may use **any integer** between 0 and 10 (e.g., 3, 7, etc.) based on how relevant you deem the relationship to be.
  5. **Language Agnosticism**: The question and the relationships may be in different languages. The relevance scoring should still work and be agnostic of the language.
@@ -318,8 +320,8 @@ async def get_graph_results(
  graph_strategy: GraphStrategy,
  text_block_reranker: Reranker,
  metrics: Metrics,
- generative_model: Optional[str] = None,
- shards: Optional[list[str]] = None,
+ generative_model: str | None = None,
+ shards: list[str] | None = None,
  ) -> tuple[KnowledgeboxFindResults, FindRequest]:
  relations = Relations(entities={})
  explored_entities: set[FrozenRelationNode] = set()
@@ -465,7 +467,7 @@ async def get_graph_results(
  async def fuzzy_search_entities(
  kbid: str,
  query: str,
- ) -> Optional[RelatedEntities]:
+ ) -> RelatedEntities | None:
  """Fuzzy find entities in KB given a query using the same methodology as /suggest, but split by words."""

  # Build an OR for each word in the query matching with fuzzy any word in any
@@ -493,7 +495,7 @@ async def fuzzy_search_entities(
  # merge shard results while deduplicating repeated entities across shards
  unique_entities: set[RelatedEntity] = set()
  for response in results:
- unique_entities.update((RelatedEntity(family=e.subtype, value=e.value) for e in response.nodes))
+ unique_entities.update(RelatedEntity(family=e.subtype, value=e.value) for e in response.nodes)

  return RelatedEntities(entities=list(unique_entities), total=len(unique_entities))

@@ -572,7 +574,7 @@ async def rank_relations_generative(
  kbid: str,
  user: str,
  top_k: int,
- generative_model: Optional[str] = None,
+ generative_model: str | None = None,
  score_threshold: float = 2,
  max_rels_to_eval: int = 100,
  ) -> tuple[Relations, dict[str, list[float]]]:
@@ -650,7 +652,7 @@ async def rank_relations_generative(
  if response_json is None or status is None or status.code != "0":
  raise ValueError("No JSON response found")

- scored_unique_triplets: list[dict[str, Union[str, Any]]] = response_json.object["triplets"]
+ scored_unique_triplets: list[dict[str, str | Any]] = response_json.object["triplets"]

  if len(scored_unique_triplets) != len(unique_triplets):
  raise ValueError("Mismatch between input and output triplets")
@@ -716,7 +718,7 @@ def build_text_blocks_from_relations(
  This is a hacky way to generate paragraphs from relations, and it is not the intended use of TextBlockMatch.
  """
  # Build a set of unique triplets with their scores
- triplets: dict[tuple[str, str, str], tuple[float, Relations, Optional[ParagraphId]]] = defaultdict(
+ triplets: dict[tuple[str, str, str], tuple[float, Relations, ParagraphId | None]] = defaultdict(
  lambda: (0.0, Relations(entities={}), None)
  )
  paragraph_count = 0
@@ -758,7 +760,7 @@ def build_text_blocks_from_relations(
  TextBlockMatch(
  # XXX: Even though we are setting a paragraph_id, the text is not coming from the paragraph
  paragraph_id=p_id,
- score=score,
+ scores=[GraphScore(score=score)],
  score_type=SCORE_TYPE.RELATION_RELEVANCE,
  order=0,
  text=f"- {ent} {rel} {tail}", # Manually build the text
@@ -902,7 +904,7 @@ def relations_match_to_text_block_match(
  parsed_paragraph_id = paragraph_match.paragraph_id
  return TextBlockMatch(
  paragraph_id=parsed_paragraph_id,
- score=paragraph_match.score,
+ scores=[GraphScore(score=paragraph_match.score)],
  score_type=SCORE_TYPE.RELATION_RELEVANCE,
  order=0, # NOTE: this will be filled later
  text="", # NOTE: this will be filled later too