nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/merge.py

@@ -20,7 +20,8 @@
 import asyncio
 import datetime
 import math
-from typing import Any, Iterable, Optional, Set, Union
+from collections.abc import Iterable
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import (
     DocumentResult,
@@ -37,7 +38,6 @@ from nidx_protos.nodereader_pb2 import (
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.common.models_utils import from_proto
 from nucliadb.common.models_utils.from_proto import RelationTypePbMap
-from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
@@ -80,7 +80,7 @@ from .paragraphs import get_paragraph_text, get_text_sentence
 Bm25Score = tuple[float, float]
 TimestampScore = datetime.datetime
 TitleScore = str
-SortValue = Union[Bm25Score, TimestampScore, TitleScore]
+SortValue = Bm25Score | TimestampScore | TitleScore
 
 
 def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
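Most of the churn in this file's type annotations (and in metrics.py, paragraphs.py, and predict_proxy.py further down) is a mechanical migration from typing.Union/Optional to PEP 604 syntax. A minimal sketch, not from the diff, showing the two spellings are interchangeable at runtime on Python 3.10+:

    from typing import Optional, Union

    # PEP 604 unions compare equal to their typing-module counterparts, so
    # rewrites like `SortValue = Bm25Score | TimestampScore | TitleScore`
    # are behavior-preserving.
    assert int | None == Optional[int]
    assert int | float == Union[int, float]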
@@ -101,47 +101,17 @@ def entity_type_to_relation_node_type(node_type: EntityType) -> RelationNode.Nod
     }[node_type]
 
 
-def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
+def sort_results_by_score(results: list[ParagraphResult] | list[DocumentResult]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
 
 
-async def get_sort_value(
-    item: Union[DocumentResult, ParagraphResult],
-    sort_field: SortField,
-    kbid: str,
-) -> Optional[SortValue]:
-    """Returns the score for given `item` and `sort_field`. If the resource is being
-    deleted, it might appear on search results but not in maindb. In this
-    specific case, return None.
-    """
-    if sort_field == SortField.SCORE:
-        return (item.score.bm25, item.score.booster)
-
-    score: Any = None
-    resource = await cache.get_resource(kbid, item.uuid)
-    if resource is None:
-        return score
-
-    basic = await resource.get_basic()
-    if basic is None:
-        return score
-
-    if sort_field == SortField.CREATED:
-        score = basic.created.ToDatetime()
-    elif sort_field == SortField.MODIFIED:
-        score = basic.modified.ToDatetime()
-    elif sort_field == SortField.TITLE:
-        score = basic.title
-
-    return score
-
-
 async def merge_documents_results(
     kbid: str,
     responses: list[DocumentSearchResponse],
     *,
     query: FulltextQuery,
     top_k: int,
+    offset: int,
 ) -> tuple[Resources, list[str]]:
     raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
     facets: dict[str, Any] = {}
@@ -159,14 +129,22 @@ async def merge_documents_results(
         if document_response.next_page:
             next_page = True
         for result in document_response.results:
-            sort_value = await get_sort_value(result, query.order_by, kbid)
+            sort_value: SortValue
+            if query.order_by == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
             if sort_value is not None:
                 raw_resource_list.append((result, sort_value))
+
         total += document_response.total
 
     # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
-    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    raw_resource_list, has_more = cut_page(raw_resource_list[offset:], top_k)
     next_page = next_page or has_more
+
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores across multiple shards doesn't change
     raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
 
     result_resource_ids = []
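The stable-sort comment introduced above leans on a documented guarantee: Python's list.sort is stable (Timsort), so entries that compare equal keep the order in which the shard responses were concatenated, making tie-breaks deterministic across requests. A small illustration with made-up shard data:

    # Each shard returns (resource_id, bm25_score) pairs; responses are
    # concatenated in a fixed shard order before sorting.
    shard_a = [("doc1", 0.9), ("doc2", 0.5)]
    shard_b = [("doc3", 0.5), ("doc4", 0.1)]
    merged = shard_a + shard_b

    merged.sort(key=lambda pair: pair[1], reverse=True)
    # doc2 and doc3 tie at 0.5; stability keeps doc2 (from shard_a) first
    # on every request, so pagination boundaries don't jitter.
    assert [rid for rid, _ in merged] == ["doc1", "doc2", "doc3", "doc4"]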
@@ -270,7 +248,7 @@ async def merge_vectors_results(
     resources: list[str],
     kbid: str,
     top_k: int,
-    min_score: Optional[float] = None,
+    min_score: float | None = None,
 ) -> Sentences:
     facets: dict[str, Any] = {}
     raw_vectors_list: list[DocumentScored] = []
@@ -350,12 +328,13 @@ async def merge_paragraph_results(
     highlight: bool,
     sort: SortOptions,
     min_score: float,
+    offset: int,
 ) -> tuple[Paragraphs, list[str]]:
     raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
     facets: dict[str, Any] = {}
     query = None
     next_page = False
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
     total = 0
     for paragraph_response in paragraph_responses:
         if ematches is None:
@@ -373,66 +352,31 @@
         if paragraph_response.next_page:
             next_page = True
         for result in paragraph_response.results:
-            score = await get_sort_value(result, sort.field, kbid)
-            if score is not None:
-                raw_paragraph_list.append((result, score))
+            sort_value: SortValue
+            if sort.field == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
+            if sort_value is not None:
+                raw_paragraph_list.append((result, sort_value))
+
         total += paragraph_response.total
 
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores across multiple shards doesn't change
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list[offset:], top_k)
     next_page = next_page or has_more
 
     result_resource_ids = []
-    result_paragraph_list: list[Paragraph] = []
-    for result, _ in raw_paragraph_list:
-        _, field_type, field = result.field.split("/")
-        text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=ParagraphId(
-                field_id=FieldId(
-                    rid=result.uuid,
-                    type=field_type,
-                    key=field,
-                    subfield_id=result.split,
-                ),
-                paragraph_start=result.start,
-                paragraph_end=result.end,
-            ),
-            highlight=highlight,
-            ematches=ematches,
-            matches=result.matches,  # type: ignore
-        )
-        labels = await get_labels_paragraph(result, kbid)
-        fuzzy_result = len(result.matches) > 0
-        new_paragraph = Paragraph(
-            score=result.score.bm25,
-            rid=result.uuid,
-            field_type=field_type,
-            field=field,
-            text=text,
-            labels=labels,
-            position=TextPosition(
-                index=result.metadata.position.index,
-                start=result.metadata.position.start,
-                end=result.metadata.position.end,
-                page_number=result.metadata.position.page_number,
-            ),
-            fuzzy_result=fuzzy_result,
-        )
-        if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
-            new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
-            new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
-        else:
-            # TODO: Remove once we are sure all data has been migrated!
-            seconds_positions = await get_seconds_paragraph(result, kbid)
-            if seconds_positions is not None:
-                new_paragraph.start_seconds = seconds_positions[0]
-                new_paragraph.end_seconds = seconds_positions[1]
+    result_paragraph_list: list[Paragraph] = await asyncio.gather(
+        *(load_paragraph(result, kbid, highlight, ematches) for result, _ in raw_paragraph_list)
+    )
+    for paragraph in result_paragraph_list:
+        if paragraph.rid not in result_resource_ids:
+            result_resource_ids.append(paragraph.rid)
 
-        result_paragraph_list.append(new_paragraph)
-        if new_paragraph.rid not in result_resource_ids:
-            result_resource_ids.append(new_paragraph.rid)
     return Paragraphs(
         results=result_paragraph_list,
         facets=facets,
@@ -445,6 +389,56 @@
     ), result_resource_ids
 
 
+async def load_paragraph(
+    result: ParagraphResult, kbid: str, highlight: bool, ematches: list[str] | None
+) -> Paragraph:
+    _, field_type, field = result.field.split("/")
+    text = await get_paragraph_text(
+        kbid=kbid,
+        paragraph_id=ParagraphId(
+            field_id=FieldId(
+                rid=result.uuid,
+                type=field_type,
+                key=field,
+                subfield_id=result.split,
+            ),
+            paragraph_start=result.start,
+            paragraph_end=result.end,
+        ),
+        highlight=highlight,
+        ematches=ematches,
+        matches=result.matches,  # type: ignore
+    )
+    labels = await get_labels_paragraph(result, kbid)
+    fuzzy_result = len(result.matches) > 0
+    new_paragraph = Paragraph(
+        score=result.score.bm25,
+        rid=result.uuid,
+        field_type=field_type,
+        field=field,
+        text=text,
+        labels=labels,
+        position=TextPosition(
+            index=result.metadata.position.index,
+            start=result.metadata.position.start,
+            end=result.metadata.position.end,
+            page_number=result.metadata.position.page_number,
+        ),
+        fuzzy_result=fuzzy_result,
+    )
+    if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
+        new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
+        new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
+    else:
+        # TODO: Remove once we are sure all data has been migrated!
+        seconds_positions = await get_seconds_paragraph(result, kbid)
+        if seconds_positions is not None:
+            new_paragraph.start_seconds = seconds_positions[0]
+            new_paragraph.end_seconds = seconds_positions[1]
+
+    return new_paragraph
+
+
 @merge_observer.wrap({"type": "merge_relations"})
 async def merge_relations_results(
     graph_responses: list[GraphSearchResponse],
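The new load_paragraph helper lets merge_paragraph_results hydrate all paragraphs of a page concurrently with asyncio.gather instead of awaiting each one in turn; gather also returns results in input order, so the previously sorted list stays sorted. A self-contained sketch of that sequential-to-concurrent move (hydrate is a hypothetical stand-in for load_paragraph):

    import asyncio

    async def hydrate(item: int) -> int:
        await asyncio.sleep(0.01)  # stands in for per-paragraph text/label fetches
        return item * 2

    async def main() -> None:
        items = [3, 1, 2]
        # Before: one await per item; total latency is the sum of all fetches.
        sequential = [await hydrate(i) for i in items]
        # After: all fetches in flight at once; latency is roughly the slowest
        # fetch, and results come back in input order.
        concurrent = await asyncio.gather(*(hydrate(i) for i in items))
        assert sequential == list(concurrent)

    asyncio.run(main())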
@@ -520,6 +514,7 @@ async def merge_results(
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
+    offset: int,
     highlight: bool = False,
 ) -> KnowledgeboxSearchResults:
     paragraphs = []
@@ -543,6 +538,7 @@
             documents,
             query=retrieval.query.fulltext,
             top_k=retrieval.top_k,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -550,7 +546,6 @@
         sort = SortOptions(
             field=retrieval.query.keyword.order_by,
             order=retrieval.query.keyword.sort,
-            limit=None,  # unused
         )
         api_results.paragraphs, matched_resources = await merge_paragraph_results(
             kbid,
@@ -559,6 +554,7 @@
             highlight,
             sort,
             min_score=retrieval.query.keyword.min_score,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -601,9 +597,9 @@ async def merge_paragraphs_results(
         sort=SortOptions(
             field=SortField.SCORE,
             order=SortOrder.DESC,
-            limit=None,
         ),
         min_score=min_score,
+        offset=0,
     )
     return api_results
 
@@ -611,7 +607,7 @@
 async def merge_suggest_entities_results(
     suggest_responses: list[SuggestResponse],
 ) -> RelatedEntities:
-    unique_entities: Set[RelatedEntity] = set()
+    unique_entities: set[RelatedEntity] = set()
     for response in suggest_responses:
         response_entities = (
             RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes
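The offset parameter threaded through merge_results, merge_documents_results, and merge_paragraph_results applies pagination at merge time: the merged candidate list is sliced from offset before being cut to top_k. A toy illustration, where cut_page is a simplified stand-in for nucliadb.search.search.cut.cut_page:

    def cut_page(items: list[str], top_k: int) -> tuple[list[str], bool]:
        # Keep the first top_k items and report whether any were left over.
        return items[:top_k], len(items) > top_k

    merged = ["r1", "r2", "r3", "r4", "r5"]  # results merged from all shards
    offset, top_k = 2, 2
    page, has_more = cut_page(merged[offset:], top_k)
    assert page == ["r3", "r4"] and has_more  # "r5" is left for the next page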
nucliadb/search/search/metrics.py

@@ -19,7 +19,7 @@
 #
 import contextlib
 import time
-from typing import Any, Optional, Union
+from typing import Any
 
 from nucliadb_telemetry import metrics
 
@@ -27,6 +27,7 @@ merge_observer = metrics.Observer("merge_results", labels={"type": ""})
 node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
 query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
 query_parser_observer = metrics.Observer("nucliadb_query_parser", labels={"type": ""})
+search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})
 
 buckets = [
     0.005,
@@ -49,6 +50,10 @@ buckets = [
 ]
 
 generative_first_chunk_histogram = metrics.Histogram(
+    name="generative_reasoning_first_chunk",
+    buckets=buckets,
+)
+reasoning_first_chunk_histogram = metrics.Histogram(
     name="generative_first_chunk",
     buckets=buckets,
 )
@@ -58,7 +63,7 @@ rag_histogram = metrics.Histogram(
     buckets=buckets,
 )
 
-MetricsData = dict[str, Union[int, float]]
+MetricsData = dict[str, int | float]
 
 
 class Metrics:
@@ -82,10 +87,10 @@ class Metrics:
         self.child_spans.append(child_span)
         return child_span
 
-    def set(self, key: str, value: Union[int, float]):
+    def set(self, key: str, value: int | float):
         self._metrics[key] = value
 
-    def get(self, key: str) -> Optional[Union[int, float]]:
+    def get(self, key: str) -> int | float | None:
         return self._metrics.get(key)
 
     def to_dict(self) -> MetricsData:
@@ -98,7 +103,7 @@ class Metrics:
         result[self.id] = self.to_dict()
         return result
 
-    def __getitem__(self, key: str) -> Union[int, float]:
+    def __getitem__(self, key: str) -> int | float:
         return self._metrics[key]
 
 
@@ -106,13 +111,25 @@ class AskMetrics(Metrics):
     def __init__(self: "AskMetrics"):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
-        self.first_chunk_yielded_at: Optional[float] = None
+        self.first_chunk_yielded_at: float | None = None
+        self.first_reasoning_chunk_yielded_at: float | None = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
         generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
 
-    def get_first_chunk_time(self) -> Optional[float]:
+    def record_first_reasoning_chunk_yielded(self):
+        self.first_reasoning_chunk_yielded_at = time.monotonic()
+        reasoning_first_chunk_histogram.observe(
+            self.first_reasoning_chunk_yielded_at - self.global_start
+        )
+
+    def get_first_chunk_time(self) -> float | None:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
+
+    def get_first_reasoning_chunk_time(self) -> float | None:
+        if self.first_reasoning_chunk_yielded_at is None:
+            return None
+        return self.first_reasoning_chunk_yielded_at - self.global_start
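AskMetrics now records a second timestamp for the first reasoning chunk, mirroring the existing answer-chunk logic: capture time.monotonic() once per stream kind and report the delta from global_start. A reduced sketch of the pattern (an illustrative class, not the package's API):

    import time

    class FirstChunkTimer:
        def __init__(self) -> None:
            self.start = time.monotonic()
            self.first_chunk_at: float | None = None

        def record_first_chunk(self) -> None:
            # monotonic() is unaffected by wall-clock adjustments, so the
            # delta is a safe latency measurement.
            if self.first_chunk_at is None:
                self.first_chunk_at = time.monotonic()

        def first_chunk_time(self) -> float | None:
            if self.first_chunk_at is None:
                return None
            return self.first_chunk_at - self.start

    timer = FirstChunkTimer()
    timer.record_first_chunk()
    elapsed = timer.first_chunk_time()
    assert elapsed is not None and elapsed >= 0.0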
nucliadb/search/search/paragraphs.py

@@ -20,7 +20,6 @@
 import logging
 import re
 import string
-from typing import Optional
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -58,7 +57,7 @@ async def get_paragraph_from_full_text(
     field: Field,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
     log_on_missing_field: bool = True,
 ) -> str:
     """
@@ -90,11 +89,10 @@ async def get_paragraph_text(
     kbid: str,
     paragraph_id: ParagraphId,
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    matches: Optional[list[str]] = None,
-    orm_resource: Optional[
-        ResourceORM
-    ] = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
+    ematches: list[str] | None = None,
+    matches: list[str] | None = None,
+    orm_resource: None
+    | (ResourceORM) = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
     log_on_missing_field: bool = True,
 ) -> str:
     rid = paragraph_id.rid
@@ -139,7 +137,7 @@ async def get_text_sentence(
     index: int,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
 ) -> str:
     """
     Leave separated from get paragraph for now until we understand the differences
@@ -169,7 +167,7 @@
 
 
 def highlight_paragraph(
-    text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
+    text: str, words: list[str] | None = None, ematches: list[str] | None = None
 ) -> str:
     """
     Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.
nucliadb/search/search/predict_proxy.py

@@ -19,7 +19,7 @@
 #
 import json
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any
 
 import aiohttp
 from fastapi.datastructures import QueryParams
@@ -28,6 +28,7 @@ from multidict import CIMultiDictProxy
 from nuclia_models.predict.generative_responses import (
     GenerativeChunk,
     JSONGenerativeResponse,
+    ReasoningGenerativeResponse,
     StatusGenerativeResponse,
     TextGenerativeResponse,
 )
@@ -77,9 +78,9 @@ async def predict_proxy(
     user_id: str,
     client_type: NucliaDBClientType,
     origin: str,
-    json: Optional[Any] = None,
+    json: Any | None = None,
     headers: dict[str, str] = {},
-) -> Union[Response, StreamingResponse]:
+) -> Response | StreamingResponse:
     if not await exists_kb(kbid=kbid):
         raise datamanagers.exceptions.KnowledgeBoxNotFound()
 
@@ -87,6 +88,7 @@
     predict_headers = predict.get_predict_headers(kbid)
     user_headers = {k: v for k, v in headers.items() if k.capitalize() in ALLOWED_HEADERS}
 
+    metrics = AskMetrics()
     # Proxy the request to predict API
     predict_response = await predict.make_request(
         method=method,
@@ -97,11 +99,15 @@
     )
 
     status_code = predict_response.status
+
+    # Only audit /predict/chat successful responses
+    should_audit = endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300
+
     media_type = predict_response.headers.get("Content-Type")
-    response: Union[Response, StreamingResponse]
+    response: Response | StreamingResponse
     user_query = json.get("question") if json is not None else ""
     if predict_response.headers.get("Transfer-Encoding") == "chunked":
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
             streaming_generator = chat_streaming_generator(
                 predict_response=predict_response,
                 kbid=kbid,
@@ -109,7 +115,8 @@
                 client_type=client_type,
                 origin=origin,
                 user_query=user_query,
-                is_json="json" in (media_type or ""),
+                is_ndjson_stream="json" in (media_type or ""),
+                metrics=metrics,
             )
         else:
             streaming_generator = predict_response.content.iter_any()
@@ -120,11 +127,10 @@
             media_type=media_type,
         )
     else:
-        metrics = AskMetrics()
        with metrics.time(PREDICT_ANSWER_METRIC):
             content = await predict_response.read()
 
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
             try:
                 llm_status_code = int(content[-1:].decode())  # Decode just the last char
                 if llm_status_code != 0:
@@ -140,8 +146,10 @@
                 client_type=client_type,
                 origin=origin,
                 text_answer=content,
+                text_reasoning=None,
                 generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
                 generative_answer_first_chunk_time=None,
+                generative_reasoning_first_chunk_time=None,
                 status_code=AnswerStatusCode(str(llm_status_code)),
             )
 
@@ -170,26 +178,35 @@ async def chat_streaming_generator(
     client_type: NucliaDBClientType,
     origin: str,
     user_query: str,
-    is_json: bool,
+    is_ndjson_stream: bool,
+    metrics: AskMetrics,
 ):
     first = True
+    first_reasoning = True
     status_code = AnswerStatusCode.ERROR.value
     text_answer = ""
+    text_reasoning = ""
     json_object = None
-    metrics = AskMetrics()
     with metrics.time(PREDICT_ANSWER_METRIC):
         async for chunk in predict_response.content:
-            if first:
-                metrics.record_first_chunk_yielded()
-                first = False
-
             yield chunk
-
-            if is_json:
+            if is_ndjson_stream:
                 try:
                     parsed_chunk = GenerativeChunk.model_validate_json(chunk).chunk
+                    if first and isinstance(
+                        parsed_chunk,
+                        (TextGenerativeResponse, JSONGenerativeResponse, StatusGenerativeResponse),
+                    ):
+                        metrics.record_first_chunk_yielded()
+                        first = False
+
                     if isinstance(parsed_chunk, TextGenerativeResponse):
                         text_answer += parsed_chunk.text
+                    elif isinstance(parsed_chunk, ReasoningGenerativeResponse):
+                        if first_reasoning:
+                            metrics.record_first_reasoning_chunk_yielded()
+                            first_reasoning = False
+                        text_reasoning += parsed_chunk.text
                     elif isinstance(parsed_chunk, JSONGenerativeResponse):
                         json_object = parsed_chunk.object
                     elif isinstance(parsed_chunk, StatusGenerativeResponse):
@@ -201,8 +218,11 @@
                 )
             else:
                 text_answer += chunk.decode()
+                if first:
+                    metrics.record_first_chunk_yielded()
+                    first = False
 
-    if is_json is False and chunk:  # Ensure chunk is not empty before decoding
+    if is_ndjson_stream is False and chunk:  # Ensure chunk is not empty before decoding
         # If response is text the status_code comes at the last chunk of data
         last_chunk = chunk.decode()
         if last_chunk[-1] == "0":
@@ -218,8 +238,10 @@
         client_type=client_type,
         origin=origin,
         text_answer=text_answer.encode() if json_object is None else json.dumps(json_object).encode(),
+        text_reasoning=text_reasoning if text_reasoning else None,
         generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
         generative_answer_first_chunk_time=metrics.get_first_chunk_time(),
+        generative_reasoning_first_chunk_time=metrics.get_first_reasoning_chunk_time(),
         status_code=AnswerStatusCode(status_code),
     )
 
@@ -232,8 +254,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
+    text_reasoning: str | None,
     generative_answer_time: float,
-    generative_answer_first_chunk_time: Optional[float],
+    generative_answer_first_chunk_time: float | None,
+    generative_reasoning_first_chunk_time: float | None,
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
@@ -250,8 +274,10 @@
         query_context_order={},
         model=headers.get(NUCLIA_LEARNING_MODEL_HEADER),
         text_answer=text_answer,
+        text_reasoning=text_reasoning,
         generative_answer_time=generative_answer_time,
         generative_answer_first_chunk_time=generative_answer_first_chunk_time or 0,
+        generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
         rephrase_time=None,
         status_code=status_code,
     )
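Taken together, the predict_proxy changes reduce the streaming audit path to: parse each NDJSON chunk into a typed model, accumulate answer and reasoning text separately, and record one first-chunk time per stream kind. A self-contained approximation using a plain pydantic model (the real code dispatches on nuclia_models' GenerativeChunk payload types):

    from pydantic import BaseModel

    class Chunk(BaseModel):  # simplified stand-in for the GenerativeChunk payloads
        kind: str  # "text" or "reasoning"
        text: str

    stream = [
        b'{"kind": "reasoning", "text": "thinking..."}',
        b'{"kind": "text", "text": "Hello"}',
        b'{"kind": "text", "text": " world"}',
    ]

    text_answer, text_reasoning = "", ""
    for raw in stream:
        chunk = Chunk.model_validate_json(raw)  # one JSON document per chunk
        if chunk.kind == "text":
            text_answer += chunk.text
        else:
            text_reasoning += chunk.text

    assert text_answer == "Hello world"
    assert text_reasoning == "thinking..."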