nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/search/search/hydrator/resources.py
@@ -20,11 +20,16 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-
-from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb.models.internal.augment import (
+    ResourceOrigin,
+    ResourceProp,
+    ResourceSecurity,
+    ResourceSummary,
+    ResourceTitle,
+)
+from nucliadb.search.augmentor.resources import db_augment_resource
 from nucliadb_models import hydration as hydration_models
-from nucliadb_models.security import ResourceSecurity
 
 
 async def hydrate_resource(
@@ -35,22 +40,21 @@ async def hydrate_resource(
     slug = basic.slug
     hydrated = hydration_models.HydratedResource(id=rid, slug=slug)
 
+    select: list[ResourceProp] = []
     if config.title:
-        hydrated.title = basic.title
+        select.append(ResourceTitle())
     if config.summary:
-        hydrated.summary = basic.summary
-
+        select.append(ResourceSummary())
+    if config.origin:
+        select.append(ResourceOrigin())
     if config.security:
-        security = await resource.get_security()
-        hydrated.security = ResourceSecurity(access_groups=[])
-        if security is not None:
-            for group_id in security.access_groups:
-                hydrated.security.access_groups.append(group_id)
+        select.append(ResourceSecurity())
 
-    if config.origin:
-        origin = await resource.get_origin()
-        if origin is not None:
-            # TODO: we want a better hydration than proto to JSON
-            hydrated.origin = from_proto.origin(origin)
+    augmented = await db_augment_resource(resource, select)
+
+    hydrated.title = augmented.title
+    hydrated.summary = augmented.summary
+    hydrated.origin = augmented.origin
+    hydrated.security = augmented.security
 
     return hydrated
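
The hunks above replace per-property fetches with a declarative list of ResourceProp selectors resolved in a single db_augment_resource call. A minimal sketch of the resulting call pattern, using only names visible in the diff; the exact signature and return model of db_augment_resource are assumptions:

from nucliadb.models.internal.augment import ResourceProp, ResourceSecurity, ResourceTitle
from nucliadb.search.augmentor.resources import db_augment_resource

async def title_and_security(resource):
    # Select only the properties the caller needs; the augmentor resolves
    # them in one pass instead of one ad-hoc fetch per property.
    select: list[ResourceProp] = [ResourceTitle(), ResourceSecurity()]
    augmented = await db_augment_resource(resource, select)
    return augmented.title, augmented.security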

nucliadb/search/search/ingestion_agents.py
@@ -19,10 +19,10 @@
 #
 import asyncio
 from base64 import b64encode
-from typing import Optional
 
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.search.predict_models import (
     FieldInfo,
     NameOperationFilter,
@@ -40,8 +40,8 @@ async def run_agents(
     kbid: str,
     rid: str,
     user_id: str,
-    filters: Optional[list[AgentsFilter]] = None,
-    agent_ids: Optional[list[str]] = None,
+    filters: list[AgentsFilter] | None = None,
+    agent_ids: list[str] | None = None,
 ) -> RunAgentsResponse:
     fields = await fetch_resource_fields(kbid, rid)
 
@@ -56,7 +56,7 @@ async def run_agents(
     return await predict.run_agents(kbid, item)
 
 
-def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameOperationFilter]]:
+def _parse_filters(filters: list[AgentsFilter] | None) -> list[NameOperationFilter] | None:
     if filters is None:
         return None
     return [
@@ -69,7 +69,7 @@ def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameO
 
 async def fetch_resource_fields(kbid: str, rid: str) -> list[FieldInfo]:
     async with datamanagers.with_ro_transaction() as txn:
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
        if resource is None:
            raise ResourceNotFoundError()
        fields = await resource.get_fields(force=True)
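
Most of the annotation churn in this file, and throughout the rest of the diff, is a mechanical migration from typing.Optional and typing.Union to the PEP 604 union syntax available since Python 3.10. The two spellings are interchangeable at runtime; a quick self-contained check:

import typing

def old_style(filters: typing.Optional[list[str]] = None) -> typing.Union[int, float]:
    return 0

def new_style(filters: list[str] | None = None) -> int | float:
    return 0

# PEP 604 unions compare equal to their typing equivalents on 3.10+.
assert (int | None) == typing.Optional[int]
assert (int | float) == typing.Union[int, float]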

nucliadb/search/search/merge.py
@@ -20,7 +20,8 @@
 import asyncio
 import datetime
 import math
-from typing import Any, Iterable, Optional, Set, Union
+from collections.abc import Iterable
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import (
     DocumentResult,
@@ -37,7 +38,6 @@ from nidx_protos.nodereader_pb2 import (
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.common.models_utils import from_proto
 from nucliadb.common.models_utils.from_proto import RelationTypePbMap
-from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
@@ -80,7 +80,7 @@ from .paragraphs import get_paragraph_text, get_text_sentence
 Bm25Score = tuple[float, float]
 TimestampScore = datetime.datetime
 TitleScore = str
-SortValue = Union[Bm25Score, TimestampScore, TitleScore]
+SortValue = Bm25Score | TimestampScore | TitleScore
 
 
 def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
@@ -101,47 +101,17 @@ def entity_type_to_relation_node_type(node_type: EntityType) -> RelationNode.Nod
     }[node_type]
 
 
-def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
+def sort_results_by_score(results: list[ParagraphResult] | list[DocumentResult]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
 
 
-async def get_sort_value(
-    item: Union[DocumentResult, ParagraphResult],
-    sort_field: SortField,
-    kbid: str,
-) -> Optional[SortValue]:
-    """Returns the score for given `item` and `sort_field`. If the resource is being
-    deleted, it might appear on search results but not in maindb. In this
-    specific case, return None.
-    """
-    if sort_field == SortField.SCORE:
-        return (item.score.bm25, item.score.booster)
-
-    score: Any = None
-    resource = await cache.get_resource(kbid, item.uuid)
-    if resource is None:
-        return score
-
-    basic = await resource.get_basic()
-    if basic is None:
-        return score
-
-    if sort_field == SortField.CREATED:
-        score = basic.created.ToDatetime()
-    elif sort_field == SortField.MODIFIED:
-        score = basic.modified.ToDatetime()
-    elif sort_field == SortField.TITLE:
-        score = basic.title
-
-    return score
-
-
 async def merge_documents_results(
     kbid: str,
     responses: list[DocumentSearchResponse],
     *,
     query: FulltextQuery,
     top_k: int,
+    offset: int,
 ) -> tuple[Resources, list[str]]:
     raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
     facets: dict[str, Any] = {}
@@ -159,14 +129,22 @@ async def merge_documents_results(
         if document_response.next_page:
             next_page = True
         for result in document_response.results:
-            sort_value = await get_sort_value(result, query.order_by, kbid)
+            sort_value: SortValue
+            if query.order_by == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
             if sort_value is not None:
                 raw_resource_list.append((result, sort_value))
+
         total += document_response.total
 
     # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
-    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    raw_resource_list, has_more = cut_page(raw_resource_list[offset:], top_k)
     next_page = next_page or has_more
+
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
 
     result_resource_ids = []
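
Two behavioral points in the merge.py hunk above are worth noting: get_sort_value, with its per-result maindb lookup through the resource cache, is gone because non-score orderings now read the timestamp directly from the index result's date field, and pagination is handled by slicing the merged candidate list at offset before cut_page keeps top_k entries. The new comments also lean on list.sort() being stable so that equal scores keep a deterministic cross-shard order. A toy illustration of both properties:

# Candidates from two shards, appended in shard order; the stable sort
# keeps the relative order of equal scores deterministic.
merged = [("shard1-a", 0.9), ("shard2-a", 0.9), ("shard1-b", 0.5)]
merged.sort(key=lambda r: r[1], reverse=True)
assert [r[0] for r in merged] == ["shard1-a", "shard2-a", "shard1-b"]

# Offset pagination, as in cut_page(raw_resource_list[offset:], top_k):
offset, top_k = 1, 2
page = merged[offset:][:top_k]
assert [r[0] for r in page] == ["shard2-a", "shard1-b"]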

@@ -270,7 +248,7 @@ async def merge_vectors_results(
     resources: list[str],
     kbid: str,
     top_k: int,
-    min_score: Optional[float] = None,
+    min_score: float | None = None,
 ) -> Sentences:
     facets: dict[str, Any] = {}
     raw_vectors_list: list[DocumentScored] = []
@@ -350,12 +328,13 @@ async def merge_paragraph_results(
     highlight: bool,
     sort: SortOptions,
     min_score: float,
+    offset: int,
 ) -> tuple[Paragraphs, list[str]]:
     raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
     facets: dict[str, Any] = {}
     query = None
     next_page = False
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
     total = 0
     for paragraph_response in paragraph_responses:
         if ematches is None:
@@ -373,66 +352,31 @@ async def merge_paragraph_results(
         if paragraph_response.next_page:
             next_page = True
         for result in paragraph_response.results:
-            score = await get_sort_value(result, sort.field, kbid)
-            if score is not None:
-                raw_paragraph_list.append((result, score))
+            sort_value: SortValue
+            if sort.field == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
+            if sort_value is not None:
+                raw_paragraph_list.append((result, sort_value))
+
         total += paragraph_response.total
 
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list[offset:], top_k)
     next_page = next_page or has_more
 
     result_resource_ids = []
-    result_paragraph_list: list[Paragraph] = []
-    for result, _ in raw_paragraph_list:
-        _, field_type, field = result.field.split("/")
-        text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=ParagraphId(
-                field_id=FieldId(
-                    rid=result.uuid,
-                    type=field_type,
-                    key=field,
-                    subfield_id=result.split,
-                ),
-                paragraph_start=result.start,
-                paragraph_end=result.end,
-            ),
-            highlight=highlight,
-            ematches=ematches,
-            matches=result.matches,  # type: ignore
-        )
-        labels = await get_labels_paragraph(result, kbid)
-        fuzzy_result = len(result.matches) > 0
-        new_paragraph = Paragraph(
-            score=result.score.bm25,
-            rid=result.uuid,
-            field_type=field_type,
-            field=field,
-            text=text,
-            labels=labels,
-            position=TextPosition(
-                index=result.metadata.position.index,
-                start=result.metadata.position.start,
-                end=result.metadata.position.end,
-                page_number=result.metadata.position.page_number,
-            ),
-            fuzzy_result=fuzzy_result,
-        )
-        if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
-            new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
-            new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
-        else:
-            # TODO: Remove once we are sure all data has been migrated!
-            seconds_positions = await get_seconds_paragraph(result, kbid)
-            if seconds_positions is not None:
-                new_paragraph.start_seconds = seconds_positions[0]
-                new_paragraph.end_seconds = seconds_positions[1]
+    result_paragraph_list: list[Paragraph] = await asyncio.gather(
+        *(load_paragraph(result, kbid, highlight, ematches) for result, _ in raw_paragraph_list)
+    )
+    for paragraph in result_paragraph_list:
+        if paragraph.rid not in result_resource_ids:
+            result_resource_ids.append(paragraph.rid)
 
-        result_paragraph_list.append(new_paragraph)
-        if new_paragraph.rid not in result_resource_ids:
-            result_resource_ids.append(new_paragraph.rid)
     return Paragraphs(
         results=result_paragraph_list,
         facets=facets,
@@ -445,6 +389,56 @@ async def merge_paragraph_results(
     ), result_resource_ids
 
 
+async def load_paragraph(
+    result: ParagraphResult, kbid: str, highlight: bool, ematches: list[str] | None
+) -> Paragraph:
+    _, field_type, field = result.field.split("/")
+    text = await get_paragraph_text(
+        kbid=kbid,
+        paragraph_id=ParagraphId(
+            field_id=FieldId(
+                rid=result.uuid,
+                type=field_type,
+                key=field,
+                subfield_id=result.split,
+            ),
+            paragraph_start=result.start,
+            paragraph_end=result.end,
+        ),
+        highlight=highlight,
+        ematches=ematches,
+        matches=result.matches,  # type: ignore
+    )
+    labels = await get_labels_paragraph(result, kbid)
+    fuzzy_result = len(result.matches) > 0
+    new_paragraph = Paragraph(
+        score=result.score.bm25,
+        rid=result.uuid,
+        field_type=field_type,
+        field=field,
+        text=text,
+        labels=labels,
+        position=TextPosition(
+            index=result.metadata.position.index,
+            start=result.metadata.position.start,
+            end=result.metadata.position.end,
+            page_number=result.metadata.position.page_number,
+        ),
+        fuzzy_result=fuzzy_result,
+    )
+    if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
+        new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
+        new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
+    else:
+        # TODO: Remove once we are sure all data has been migrated!
+        seconds_positions = await get_seconds_paragraph(result, kbid)
+        if seconds_positions is not None:
+            new_paragraph.start_seconds = seconds_positions[0]
+            new_paragraph.end_seconds = seconds_positions[1]
+
+    return new_paragraph
+
+
 @merge_observer.wrap({"type": "merge_relations"})
 async def merge_relations_results(
     graph_responses: list[GraphSearchResponse],
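
Still in merge.py: hoisting the old loop body into the new load_paragraph coroutine lets merge_paragraph_results hydrate all paragraph texts concurrently via asyncio.gather, rather than awaiting each get_paragraph_text in sequence. A self-contained sketch of that transformation, with a hypothetical fetch standing in for the per-paragraph I/O:

import asyncio

async def fetch(item: int) -> int:
    await asyncio.sleep(0.01)  # stands in for a storage/maindb read
    return item * 2

async def sequential(items: list[int]) -> list[int]:
    return [await fetch(i) for i in items]  # one read at a time

async def concurrent(items: list[int]) -> list[int]:
    # Reads overlap; gather preserves input order in its result list.
    return await asyncio.gather(*(fetch(i) for i in items))

assert asyncio.run(concurrent([1, 2, 3])) == [2, 4, 6]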

@@ -520,6 +514,7 @@ async def merge_results(
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
+    offset: int,
     highlight: bool = False,
 ) -> KnowledgeboxSearchResults:
     paragraphs = []
@@ -543,6 +538,7 @@ async def merge_results(
            documents,
            query=retrieval.query.fulltext,
            top_k=retrieval.top_k,
+           offset=offset,
        )
        resources.extend(matched_resources)
 
@@ -550,7 +546,6 @@ async def merge_results(
        sort = SortOptions(
            field=retrieval.query.keyword.order_by,
            order=retrieval.query.keyword.sort,
-           limit=None,  # unused
        )
        api_results.paragraphs, matched_resources = await merge_paragraph_results(
            kbid,
@@ -559,6 +554,7 @@ async def merge_results(
            highlight,
            sort,
            min_score=retrieval.query.keyword.min_score,
+           offset=offset,
        )
        resources.extend(matched_resources)
 
@@ -601,9 +597,9 @@ async def merge_paragraphs_results(
        sort=SortOptions(
            field=SortField.SCORE,
            order=SortOrder.DESC,
-           limit=None,
        ),
        min_score=min_score,
+       offset=0,
    )
    return api_results
 
@@ -611,7 +607,7 @@ async def merge_paragraphs_results(
 async def merge_suggest_entities_results(
     suggest_responses: list[SuggestResponse],
 ) -> RelatedEntities:
-    unique_entities: Set[RelatedEntity] = set()
+    unique_entities: set[RelatedEntity] = set()
     for response in suggest_responses:
         response_entities = (
             RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes

nucliadb/search/search/metrics.py
@@ -19,7 +19,7 @@
 #
 import contextlib
 import time
-from typing import Any, Optional, Union
+from typing import Any
 
 from nucliadb_telemetry import metrics
 
@@ -27,6 +27,7 @@ merge_observer = metrics.Observer("merge_results", labels={"type": ""})
 node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
 query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
 query_parser_observer = metrics.Observer("nucliadb_query_parser", labels={"type": ""})
+search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})
 
 buckets = [
     0.005,
@@ -62,7 +63,7 @@ rag_histogram = metrics.Histogram(
     buckets=buckets,
 )
 
-MetricsData = dict[str, Union[int, float]]
+MetricsData = dict[str, int | float]
 
 
 class Metrics:
@@ -86,10 +87,10 @@ class Metrics:
         self.child_spans.append(child_span)
         return child_span
 
-    def set(self, key: str, value: Union[int, float]):
+    def set(self, key: str, value: int | float):
         self._metrics[key] = value
 
-    def get(self, key: str) -> Optional[Union[int, float]]:
+    def get(self, key: str) -> int | float | None:
         return self._metrics.get(key)
 
     def to_dict(self) -> MetricsData:
@@ -102,7 +103,7 @@ class Metrics:
         result[self.id] = self.to_dict()
         return result
 
-    def __getitem__(self, key: str) -> Union[int, float]:
+    def __getitem__(self, key: str) -> int | float:
         return self._metrics[key]
 
 
@@ -110,8 +111,8 @@ class AskMetrics(Metrics):
     def __init__(self: "AskMetrics"):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
-        self.first_chunk_yielded_at: Optional[float] = None
-        self.first_reasoning_chunk_yielded_at: Optional[float] = None
+        self.first_chunk_yielded_at: float | None = None
+        self.first_reasoning_chunk_yielded_at: float | None = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
@@ -123,12 +124,12 @@
             self.first_reasoning_chunk_yielded_at - self.global_start
         )
 
-    def get_first_chunk_time(self) -> Optional[float]:
+    def get_first_chunk_time(self) -> float | None:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
 
-    def get_first_reasoning_chunk_time(self) -> Optional[float]:
+    def get_first_reasoning_chunk_time(self) -> float | None:
         if self.first_reasoning_chunk_yielded_at is None:
             return None
         return self.first_reasoning_chunk_yielded_at - self.global_start
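
The only functional addition in metrics.py is search_observer. Judging from the @merge_observer.wrap({"type": "merge_relations"}) usage visible earlier in this diff, it would be applied the same way; a sketch, where the "find" label value is an assumption:

from nucliadb_telemetry import metrics

search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})

@search_observer.wrap({"type": "find"})  # hypothetical label value
async def timed_search(kbid: str) -> None:
    ...  # execution time is recorded under the given type label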

nucliadb/search/search/paragraphs.py
@@ -20,7 +20,6 @@
 import logging
 import re
 import string
-from typing import Optional
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -58,7 +57,7 @@ async def get_paragraph_from_full_text(
     field: Field,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
     log_on_missing_field: bool = True,
 ) -> str:
     """
@@ -90,11 +89,10 @@ async def get_paragraph_text(
     kbid: str,
     paragraph_id: ParagraphId,
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    matches: Optional[list[str]] = None,
-    orm_resource: Optional[
-        ResourceORM
-    ] = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
+    ematches: list[str] | None = None,
+    matches: list[str] | None = None,
+    orm_resource: None
+    | (ResourceORM) = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
     log_on_missing_field: bool = True,
 ) -> str:
     rid = paragraph_id.rid
@@ -139,7 +137,7 @@ async def get_text_sentence(
     index: int,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
 ) -> str:
     """
     Leave separated from get paragraph for now until we understand the differences
@@ -169,7 +167,7 @@ async def get_text_sentence(
 
 
 def highlight_paragraph(
-    text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
+    text: str, words: list[str] | None = None, ematches: list[str] | None = None
 ) -> str:
     """
     Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.

nucliadb/search/search/predict_proxy.py
@@ -19,7 +19,7 @@
 #
 import json
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any
 
 import aiohttp
 from fastapi.datastructures import QueryParams
@@ -78,9 +78,9 @@ async def predict_proxy(
     user_id: str,
     client_type: NucliaDBClientType,
     origin: str,
-    json: Optional[Any] = None,
+    json: Any | None = None,
     headers: dict[str, str] = {},
-) -> Union[Response, StreamingResponse]:
+) -> Response | StreamingResponse:
     if not await exists_kb(kbid=kbid):
         raise datamanagers.exceptions.KnowledgeBoxNotFound()
 
@@ -99,11 +99,15 @@ async def predict_proxy(
     )
 
     status_code = predict_response.status
+
+    # Only audit /predict/chat successful responses
+    should_audit = endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300
+
     media_type = predict_response.headers.get("Content-Type")
-    response: Union[Response, StreamingResponse]
+    response: Response | StreamingResponse
     user_query = json.get("question") if json is not None else ""
     if predict_response.headers.get("Transfer-Encoding") == "chunked":
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
            streaming_generator = chat_streaming_generator(
                predict_response=predict_response,
                kbid=kbid,
@@ -126,7 +130,7 @@ async def predict_proxy(
        with metrics.time(PREDICT_ANSWER_METRIC):
            content = await predict_response.read()
 
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
            try:
                llm_status_code = int(content[-1:].decode())  # Decode just the last char
                if llm_status_code != 0:
@@ -250,10 +254,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
-    text_reasoning: Optional[str],
+    text_reasoning: str | None,
     generative_answer_time: float,
-    generative_answer_first_chunk_time: Optional[float],
-    generative_reasoning_first_chunk_time: Optional[float],
+    generative_answer_first_chunk_time: float | None,
+    generative_reasoning_first_chunk_time: float | None,
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
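
The should_audit flag introduced above tightens audit logging twice over: the proxied endpoint must be the chat endpoint and the upstream status must be 2xx, so failed predict calls no longer emit audit entries. The predicate is easy to verify in isolation; a sketch using a hypothetical standalone helper with plain-string endpoints:

def should_audit(endpoint: str, status_code: int) -> bool:
    # Mirrors the gating added in predict_proxy: only successful chat
    # responses are audited.
    return endpoint == "chat" and 200 <= status_code < 300

assert should_audit("chat", 200)
assert not should_audit("chat", 502)      # upstream failure: no audit
assert not should_audit("rephrase", 200)  # other endpoints: no audit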