PyPI - nucliadb - Versions diffs - 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl - Mend

nucliadb 6.7.2.post4874-py3-none-any.whl → 6.10.0.post5705-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (246) hide show

migrations/0023_backfill_pg_catalog.py +8 -4
migrations/0028_extracted_vectors_reference.py +1 -1
migrations/0029_backfill_field_status.py +3 -4
migrations/0032_remove_old_relations.py +2 -3
migrations/0038_backfill_catalog_field_labels.py +8 -4
migrations/0039_backfill_converation_splits_metadata.py +106 -0
migrations/0040_migrate_search_configurations.py +79 -0
migrations/0041_reindex_conversations.py +137 -0
migrations/pg/0010_shards_index.py +34 -0
nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
migrations/pg/0012_catalog_statistics_undo.py +26 -0
nucliadb/backups/create.py +2 -15
nucliadb/backups/restore.py +4 -15
nucliadb/backups/tasks.py +4 -1
nucliadb/common/back_pressure/cache.py +2 -3
nucliadb/common/back_pressure/materializer.py +7 -13
nucliadb/common/back_pressure/settings.py +6 -6
nucliadb/common/back_pressure/utils.py +1 -0
nucliadb/common/cache.py +9 -9
nucliadb/common/catalog/__init__.py +79 -0
nucliadb/common/catalog/dummy.py +36 -0
nucliadb/common/catalog/interface.py +85 -0
nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
nucliadb/common/catalog/utils.py +56 -0
nucliadb/common/cluster/manager.py +8 -23
nucliadb/common/cluster/rebalance.py +484 -112
nucliadb/common/cluster/rollover.py +36 -9
nucliadb/common/cluster/settings.py +4 -9
nucliadb/common/cluster/utils.py +34 -8
nucliadb/common/context/__init__.py +7 -8
nucliadb/common/context/fastapi.py +1 -2
nucliadb/common/datamanagers/__init__.py +2 -4
nucliadb/common/datamanagers/atomic.py +9 -2
nucliadb/common/datamanagers/cluster.py +1 -2
nucliadb/common/datamanagers/fields.py +3 -4
nucliadb/common/datamanagers/kb.py +6 -6
nucliadb/common/datamanagers/labels.py +2 -3
nucliadb/common/datamanagers/resources.py +10 -33
nucliadb/common/datamanagers/rollover.py +5 -7
nucliadb/common/datamanagers/search_configurations.py +1 -2
nucliadb/common/datamanagers/synonyms.py +1 -2
nucliadb/common/datamanagers/utils.py +4 -4
nucliadb/common/datamanagers/vectorsets.py +4 -4
nucliadb/common/external_index_providers/base.py +32 -5
nucliadb/common/external_index_providers/manager.py +5 -34
nucliadb/common/external_index_providers/settings.py +1 -27
nucliadb/common/filter_expression.py +129 -41
nucliadb/common/http_clients/exceptions.py +8 -0
nucliadb/common/http_clients/processing.py +16 -23
nucliadb/common/http_clients/utils.py +3 -0
nucliadb/common/ids.py +82 -58
nucliadb/common/locking.py +1 -2
nucliadb/common/maindb/driver.py +9 -8
nucliadb/common/maindb/local.py +5 -5
nucliadb/common/maindb/pg.py +9 -8
nucliadb/common/nidx.py +22 -5
nucliadb/common/vector_index_config.py +1 -1
nucliadb/export_import/datamanager.py +4 -3
nucliadb/export_import/exporter.py +11 -19
nucliadb/export_import/importer.py +13 -6
nucliadb/export_import/tasks.py +2 -0
nucliadb/export_import/utils.py +6 -18
nucliadb/health.py +2 -2
nucliadb/ingest/app.py +8 -8
nucliadb/ingest/consumer/consumer.py +8 -10
nucliadb/ingest/consumer/pull.py +10 -8
nucliadb/ingest/consumer/service.py +5 -30
nucliadb/ingest/consumer/shard_creator.py +16 -5
nucliadb/ingest/consumer/utils.py +1 -1
nucliadb/ingest/fields/base.py +37 -49
nucliadb/ingest/fields/conversation.py +55 -9
nucliadb/ingest/fields/exceptions.py +1 -2
nucliadb/ingest/fields/file.py +22 -8
nucliadb/ingest/fields/link.py +7 -7
nucliadb/ingest/fields/text.py +2 -3
nucliadb/ingest/orm/brain_v2.py +89 -57
nucliadb/ingest/orm/broker_message.py +2 -4
nucliadb/ingest/orm/entities.py +10 -209
nucliadb/ingest/orm/index_message.py +128 -113
nucliadb/ingest/orm/knowledgebox.py +91 -59
nucliadb/ingest/orm/processor/auditing.py +1 -3
nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
nucliadb/ingest/orm/processor/processor.py +98 -153
nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
nucliadb/ingest/orm/resource.py +82 -71
nucliadb/ingest/orm/utils.py +1 -1
nucliadb/ingest/partitions.py +12 -1
nucliadb/ingest/processing.py +17 -17
nucliadb/ingest/serialize.py +202 -145
nucliadb/ingest/service/writer.py +15 -114
nucliadb/ingest/settings.py +36 -15
nucliadb/ingest/utils.py +1 -2
nucliadb/learning_proxy.py +23 -26
nucliadb/metrics_exporter.py +20 -6
nucliadb/middleware/__init__.py +82 -1
nucliadb/migrator/datamanager.py +4 -11
nucliadb/migrator/migrator.py +1 -2
nucliadb/migrator/models.py +1 -2
nucliadb/migrator/settings.py +1 -2
nucliadb/models/internal/augment.py +614 -0
nucliadb/models/internal/processing.py +19 -19
nucliadb/openapi.py +2 -2
nucliadb/purge/__init__.py +3 -8
nucliadb/purge/orphan_shards.py +1 -2
nucliadb/reader/__init__.py +5 -0
nucliadb/reader/api/models.py +6 -13
nucliadb/reader/api/v1/download.py +59 -38
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/knowledgebox.py +37 -9
nucliadb/reader/api/v1/learning_config.py +33 -14
nucliadb/reader/api/v1/resource.py +61 -9
nucliadb/reader/api/v1/services.py +18 -14
nucliadb/reader/app.py +3 -1
nucliadb/reader/reader/notifications.py +1 -2
nucliadb/search/api/v1/__init__.py +3 -0
nucliadb/search/api/v1/ask.py +3 -4
nucliadb/search/api/v1/augment.py +585 -0
nucliadb/search/api/v1/catalog.py +15 -19
nucliadb/search/api/v1/find.py +16 -22
nucliadb/search/api/v1/hydrate.py +328 -0
nucliadb/search/api/v1/knowledgebox.py +1 -2
nucliadb/search/api/v1/predict_proxy.py +1 -2
nucliadb/search/api/v1/resource/ask.py +28 -8
nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
nucliadb/search/api/v1/resource/search.py +9 -11
nucliadb/search/api/v1/retrieve.py +130 -0
nucliadb/search/api/v1/search.py +28 -32
nucliadb/search/api/v1/suggest.py +11 -14
nucliadb/search/api/v1/summarize.py +1 -2
nucliadb/search/api/v1/utils.py +2 -2
nucliadb/search/app.py +3 -2
nucliadb/search/augmentor/__init__.py +21 -0
nucliadb/search/augmentor/augmentor.py +232 -0
nucliadb/search/augmentor/fields.py +704 -0
nucliadb/search/augmentor/metrics.py +24 -0
nucliadb/search/augmentor/paragraphs.py +334 -0
nucliadb/search/augmentor/resources.py +238 -0
nucliadb/search/augmentor/utils.py +33 -0
nucliadb/search/lifecycle.py +3 -1
nucliadb/search/predict.py +33 -19
nucliadb/search/predict_models.py +8 -9
nucliadb/search/requesters/utils.py +11 -10
nucliadb/search/search/cache.py +19 -42
nucliadb/search/search/chat/ask.py +131 -59
nucliadb/search/search/chat/exceptions.py +3 -5
nucliadb/search/search/chat/fetcher.py +201 -0
nucliadb/search/search/chat/images.py +6 -4
nucliadb/search/search/chat/old_prompt.py +1375 -0
nucliadb/search/search/chat/parser.py +510 -0
nucliadb/search/search/chat/prompt.py +563 -615
nucliadb/search/search/chat/query.py +453 -32
nucliadb/search/search/chat/rpc.py +85 -0
nucliadb/search/search/fetch.py +3 -4
nucliadb/search/search/filters.py +8 -11
nucliadb/search/search/find.py +33 -31
nucliadb/search/search/find_merge.py +124 -331
nucliadb/search/search/graph_strategy.py +14 -12
nucliadb/search/search/hydrator/__init__.py +49 -0
nucliadb/search/search/hydrator/fields.py +217 -0
nucliadb/search/search/hydrator/images.py +130 -0
nucliadb/search/search/hydrator/paragraphs.py +323 -0
nucliadb/search/search/hydrator/resources.py +60 -0
nucliadb/search/search/ingestion_agents.py +5 -5
nucliadb/search/search/merge.py +90 -94
nucliadb/search/search/metrics.py +24 -7
nucliadb/search/search/paragraphs.py +7 -9
nucliadb/search/search/predict_proxy.py +44 -18
nucliadb/search/search/query.py +14 -86
nucliadb/search/search/query_parser/fetcher.py +51 -82
nucliadb/search/search/query_parser/models.py +19 -48
nucliadb/search/search/query_parser/old_filters.py +20 -19
nucliadb/search/search/query_parser/parsers/ask.py +5 -6
nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
nucliadb/search/search/query_parser/parsers/common.py +21 -13
nucliadb/search/search/query_parser/parsers/find.py +6 -29
nucliadb/search/search/query_parser/parsers/graph.py +18 -28
nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
nucliadb/search/search/query_parser/parsers/search.py +15 -56
nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
nucliadb/search/search/rank_fusion.py +18 -13
nucliadb/search/search/rerankers.py +6 -7
nucliadb/search/search/retrieval.py +300 -0
nucliadb/search/search/summarize.py +5 -6
nucliadb/search/search/utils.py +3 -4
nucliadb/search/settings.py +1 -2
nucliadb/standalone/api_router.py +1 -1
nucliadb/standalone/app.py +4 -3
nucliadb/standalone/auth.py +5 -6
nucliadb/standalone/lifecycle.py +2 -2
nucliadb/standalone/run.py +5 -4
nucliadb/standalone/settings.py +5 -6
nucliadb/standalone/versions.py +3 -4
nucliadb/tasks/consumer.py +13 -8
nucliadb/tasks/models.py +2 -1
nucliadb/tasks/producer.py +3 -3
nucliadb/tasks/retries.py +8 -7
nucliadb/train/api/utils.py +1 -3
nucliadb/train/api/v1/shards.py +1 -2
nucliadb/train/api/v1/trainset.py +1 -2
nucliadb/train/app.py +1 -1
nucliadb/train/generator.py +4 -4
nucliadb/train/generators/field_classifier.py +2 -2
nucliadb/train/generators/field_streaming.py +6 -6
nucliadb/train/generators/image_classifier.py +2 -2
nucliadb/train/generators/paragraph_classifier.py +2 -2
nucliadb/train/generators/paragraph_streaming.py +2 -2
nucliadb/train/generators/question_answer_streaming.py +2 -2
nucliadb/train/generators/sentence_classifier.py +4 -10
nucliadb/train/generators/token_classifier.py +3 -2
nucliadb/train/generators/utils.py +6 -5
nucliadb/train/nodes.py +3 -3
nucliadb/train/resource.py +6 -8
nucliadb/train/settings.py +3 -4
nucliadb/train/types.py +11 -11
nucliadb/train/upload.py +3 -2
nucliadb/train/uploader.py +1 -2
nucliadb/train/utils.py +1 -2
nucliadb/writer/api/v1/export_import.py +4 -1
nucliadb/writer/api/v1/field.py +15 -14
nucliadb/writer/api/v1/knowledgebox.py +18 -56
nucliadb/writer/api/v1/learning_config.py +5 -4
nucliadb/writer/api/v1/resource.py +9 -20
nucliadb/writer/api/v1/services.py +10 -132
nucliadb/writer/api/v1/upload.py +73 -72
nucliadb/writer/app.py +8 -2
nucliadb/writer/resource/basic.py +12 -15
nucliadb/writer/resource/field.py +43 -5
nucliadb/writer/resource/origin.py +7 -0
nucliadb/writer/settings.py +2 -3
nucliadb/writer/tus/__init__.py +2 -3
nucliadb/writer/tus/azure.py +5 -7
nucliadb/writer/tus/dm.py +3 -3
nucliadb/writer/tus/exceptions.py +3 -4
nucliadb/writer/tus/gcs.py +15 -22
nucliadb/writer/tus/s3.py +2 -3
nucliadb/writer/tus/storage.py +3 -3
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
nucliadb/common/datamanagers/entities.py +0 -139
nucliadb/common/external_index_providers/pinecone.py +0 -894
nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
nucliadb/search/search/hydrator.py +0 -197
nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
{nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/search/search/chat/prompt.py CHANGED Viewed

@@ -17,34 +17,36 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-import asyncio
 import copy
-from collections import deque
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Deque, Dict, List, Optional, Sequence, Tuple, Union, cast
+from typing import cast
 import yaml
 from pydantic import BaseModel
-from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
-from nucliadb.common.maindb.utils import get_driver
-from nucliadb.common.models_utils import from_proto
-from nucliadb.ingest.fields.base import Field
-from nucliadb.ingest.fields.conversation import Conversation
-from nucliadb.ingest.fields.file import File
-from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
-from nucliadb.search import logger
-from nucliadb.search.search import cache
-from nucliadb.search.search.chat.images import (
-    get_file_thumbnail_image,
-    get_page_image,
-    get_paragraph_image,
+import nucliadb_models
+from nucliadb.common.ids import (
+    FIELD_TYPE_STR_TO_NAME,
+    FieldId,
+    ParagraphId,
 )
-from nucliadb.search.search.hydrator import hydrate_field_text, hydrate_resource_text
+from nucliadb.search import logger
+from nucliadb.search.search.chat import rpc
 from nucliadb.search.search.metrics import Metrics
-from nucliadb.search.search.paragraphs import get_paragraph_text
+from nucliadb_models.augment import (
+    AugmentedConversationField,
+    AugmentedField,
+    AugmentedFileField,
+    AugmentFields,
+    AugmentParagraph,
+    AugmentParagraphs,
+    AugmentRequest,
+    AugmentResourceFields,
+    AugmentResources,
+)
+from nucliadb_models.common import FieldTypeName
 from nucliadb_models.labels import translate_alias_to_system_label
-from nucliadb_models.metadata import Extra, Origin
 from nucliadb_models.search import (
     SCORE_TYPE,
     AugmentedContext,
@@ -71,24 +73,9 @@ from nucliadb_models.search import (
     TextBlockAugmentationType,
     TextPosition,
 )
-from nucliadb_protos import resources_pb2
-from nucliadb_protos.resources_pb2 import ExtractedText, FieldComputedMetadata
-from nucliadb_utils.asyncio_utils import run_concurrently
-from nucliadb_utils.utilities import get_storage
-MAX_RESOURCE_TASKS = 5
-MAX_RESOURCE_FIELD_TASKS = 4
-# Number of messages to pull after a match in a message
-# The hope here is it will be enough to get the answer to the question.
-CONVERSATION_MESSAGE_CONTEXT_EXPANSION = 15
-TextBlockId = Union[ParagraphId, FieldId]
+from nucliadb_protos.resources_pb2 import FieldComputedMetadata
-class ParagraphIdNotFoundInExtractedMetadata(Exception):
-    pass
+TextBlockId = ParagraphId | FieldId
 class CappedPromptContext:
@@ -97,7 +84,7 @@ class CappedPromptContext:
     and automatically trim data that exceeds the limit when it's being set on the dictionary.
     """
-    def __init__(self, max_size: Optional[int]):
+    def __init__(self, max_size: int | None):
         self.output: PromptContext = {}
         self.images: PromptContextImages = {}
         self.max_size = max_size
@@ -158,79 +145,6 @@ class CappedPromptContext:
         return self.output
-async def get_next_conversation_messages(
-    *,
-    field_obj: Conversation,
-    page: int,
-    start_idx: int,
-    num_messages: int,
-    message_type: Optional[resources_pb2.Message.MessageType.ValueType] = None,
-    msg_to: Optional[str] = None,
-) -> List[resources_pb2.Message]:
-    output = []
-    cmetadata = await field_obj.get_metadata()
-    for current_page in range(page, cmetadata.pages + 1):
-        conv = await field_obj.db_get_value(current_page)
-        for message in conv.messages[start_idx:]:
-            if message_type is not None and message.type != message_type:  # pragma: no cover
-                continue
-            if msg_to is not None and msg_to not in message.to:  # pragma: no cover
-                continue
-            output.append(message)
-            if len(output) >= num_messages:
-                return output
-        start_idx = 0
-    return output
-async def find_conversation_message(
-    field_obj: Conversation, mident: str
-) -> tuple[Optional[resources_pb2.Message], int, int]:
-    cmetadata = await field_obj.get_metadata()
-    for page in range(1, cmetadata.pages + 1):
-        conv = await field_obj.db_get_value(page)
-        for idx, message in enumerate(conv.messages):
-            if message.ident == mident:
-                return message, page, idx
-    return None, -1, -1
-async def get_expanded_conversation_messages(
-    *,
-    kb: KnowledgeBoxORM,
-    rid: str,
-    field_id: str,
-    mident: str,
-    max_messages: int = CONVERSATION_MESSAGE_CONTEXT_EXPANSION,
-) -> list[resources_pb2.Message]:
-    resource = await kb.get(rid)
-    if resource is None:  # pragma: no cover
-        return []
-    field_obj: Conversation = await resource.get_field(field_id, FIELD_TYPE_STR_TO_PB["c"], load=True)  # type: ignore
-    found_message, found_page, found_idx = await find_conversation_message(
-        field_obj=field_obj, mident=mident
-    )
-    if found_message is None:  # pragma: no cover
-        return []
-    elif found_message.type == resources_pb2.Message.MessageType.QUESTION:
-        # only try to get answer if it was a question
-        return await get_next_conversation_messages(
-            field_obj=field_obj,
-            page=found_page,
-            start_idx=found_idx + 1,
-            num_messages=1,
-            message_type=resources_pb2.Message.MessageType.ANSWER,
-        )
-    else:
-        return await get_next_conversation_messages(
-            field_obj=field_obj,
-            page=found_page,
-            start_idx=found_idx + 1,
-            num_messages=max_messages,
-        )
 async def default_prompt_context(
     context: CappedPromptContext,
     kbid: str,
@@ -245,35 +159,59 @@ async def default_prompt_context(
     - User context is inserted first, in order of appearance.
     - Using a dict prevents duplicates from being pulled in through conversation expansion.
     """
-    # Sort retrieved paragraphs by decreasing order (most relevant first)
-    async with get_driver().ro_transaction() as txn:
-        storage = await get_storage()
-        kb = KnowledgeBoxORM(txn, storage, kbid)
-        for paragraph in ordered_paragraphs:
-            context[paragraph.id] = _clean_paragraph_text(paragraph)
-            # If the paragraph is a conversation and it matches semantically, we assume we
-            # have matched with the question, therefore try to include the answer to the
-            # context by pulling the next few messages of the conversation field
-            rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
-            if field_type == "c" and paragraph.score_type in (
-                SCORE_TYPE.VECTOR,
-                SCORE_TYPE.BOTH,
-            ):
-                expanded_msgs = await get_expanded_conversation_messages(
-                    kb=kb, rid=rid, field_id=field_id, mident=mident
-                )
-                for msg in expanded_msgs:
-                    text = msg.content.text.strip()
-                    pid = f"{rid}/{field_type}/{field_id}/{msg.ident}/0-{len(msg.content.text) + 1}"
-                    context[pid] = text
+    conversations = []
+    for paragraph in ordered_paragraphs:
+        context[paragraph.id] = _clean_paragraph_text(paragraph)
+        # If the paragraph is a conversation and it matches semantically, we
+        # assume we have matched with the question, therefore try to include the
+        # answer to the context by pulling the next few messages of the
+        # conversation field
+        rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
+        # FIXME: a semantic paragraph can have reranker score. Once we
+        # refactor and have access to the score history, we can fix this
+        if field_type == "c" and paragraph.score_type in (
+            SCORE_TYPE.VECTOR,
+            SCORE_TYPE.BOTH,
+        ):
+            conversations.append(f"{rid}/{field_type}/{field_id}/{mident}")
+    augment = AugmentRequest(
+        fields=[
+            AugmentFields(
+                given=[id for id in conversations],
+                conversation_answer_or_messages_after=True,
+            ),
+        ]
+    )
+    augmented = await rpc.augment(kbid, augment)
+    for id in conversations:
+        conversation_id = FieldId.from_string(id)
+        augmented_field = augmented.fields.get(conversation_id.full_without_subfield())
+        if augmented_field is None or not isinstance(augmented_field, AugmentedConversationField):
+            continue
+        for message in augmented_field.messages or []:
+            if message.text is None:
+                continue
+            message_id = copy.copy(conversation_id)
+            message_id.subfield_id = message.ident
+            pid = ParagraphId(
+                field_id=message_id, paragraph_start=0, paragraph_end=len(message.text)
+            ).full()
+            context[pid] = message.text
 async def full_resource_prompt_context(
     context: CappedPromptContext,
     kbid: str,
     ordered_paragraphs: list[FindParagraph],
-    resource: Optional[str],
+    rid: str | None,
     strategy: FullResourceStrategy,
     metrics: Metrics,
     augmented_context: AugmentedContext,
@@ -288,16 +226,16 @@ async def full_resource_prompt_context(
         ordered_paragraphs: The results of the retrieval (find) operation.
         resource: The resource to be included in the context. This is used only when chatting with a specific resource with no retrieval.
         strategy: strategy instance containing, for example, the number of full resources to include in the context.
-    """  # noqa: E501
-    if resource is not None:
+    """
+    if rid is not None:
         # The user has specified a resource to be included in the context.
-        ordered_resources = [resource]
+        ordered_resources = [rid]
     else:
         # Collect the list of resources in the results (in order of relevance).
         ordered_resources = []
         for paragraph in ordered_paragraphs:
-            resource_uuid = parse_text_block_id(paragraph.id).rid
-            if resource_uuid not in ordered_resources:
+            rid = parse_text_block_id(paragraph.id).rid
+            if rid not in ordered_resources:
                 skip = False
                 if strategy.apply_to is not None:
                     # decide whether the resource should be extended or not
@@ -307,35 +245,62 @@ async def full_resource_prompt_context(
                         )
                 if not skip:
-                    ordered_resources.append(resource_uuid)
-    # For each resource, collect the extracted text from all its fields.
-    resources_extracted_texts = await run_concurrently(
-        [
-            hydrate_resource_text(kbid, resource_uuid, max_concurrent_tasks=MAX_RESOURCE_FIELD_TASKS)
-            for resource_uuid in ordered_resources[: strategy.count]
-        ],
-        max_concurrent=MAX_RESOURCE_TASKS,
+                    ordered_resources.append(rid)
+                    # skip when we have enough resource ids
+                    if strategy.count is not None and len(ordered_resources) > strategy.count:
+                        break
+    ordered_resources = ordered_resources[: strategy.count]
+    # For each resource, collect the extracted text from all its fields and
+    # include the title and summary as well
+    augmented = await rpc.augment(
+        kbid,
+        AugmentRequest(
+            resources=[
+                AugmentResources(
+                    given=ordered_resources,
+                    title=True,
+                    summary=True,
+                    fields=AugmentResourceFields(
+                        text=True,
+                        filters=[],
+                    ),
+                )
+            ]
+        ),
     )
+    extracted_texts = {}
+    for rid, resource in augmented.resources.items():
+        if resource.title is not None:
+            field_id = FieldId(rid=rid, type="a", key="title").full()
+            extracted_texts[field_id] = resource.title
+        if resource.summary is not None:
+            field_id = FieldId(rid=rid, type="a", key="summary").full()
+            extracted_texts[field_id] = resource.summary
+    for field_id, field in augmented.fields.items():
+        field = cast(AugmentedField, field)
+        if field.text is not None:
+            extracted_texts[field_id] = field.text
     added_fields = set()
-    for resource_extracted_texts in resources_extracted_texts:
-        if resource_extracted_texts is None:
-            continue
-        for field, extracted_text in resource_extracted_texts:
-            # First off, remove the text block ids from paragraphs that belong to
-            # the same field, as otherwise the context will be duplicated.
-            for tb_id in context.text_block_ids():
-                if tb_id.startswith(field.full()):
-                    del context[tb_id]
-            # Add the extracted text of each field to the context.
-            context[field.full()] = extracted_text
-            augmented_context.fields[field.full()] = AugmentedTextBlock(
-                id=field.full(),
-                text=extracted_text,
-                augmentation_type=TextBlockAugmentationType.FULL_RESOURCE,
-            )
+    for field_id, extracted_text in extracted_texts.items():
+        # First off, remove the text block ids from paragraphs that belong to
+        # the same field, as otherwise the context will be duplicated.
+        for tb_id in context.text_block_ids():
+            if tb_id.startswith(field_id):
+                del context[tb_id]
+        # Add the extracted text of each field to the context.
+        context[field_id] = extracted_text
+        augmented_context.fields[field_id] = AugmentedTextBlock(
+            id=field_id,
+            text=extracted_text,
+            augmentation_type=TextBlockAugmentationType.FULL_RESOURCE,
+        )
-            added_fields.add(field.full())
+        added_fields.add(field_id)
     metrics.set("full_resource_ops", len(added_fields))
@@ -353,213 +318,167 @@ async def extend_prompt_context_with_metadata(
     metrics: Metrics,
     augmented_context: AugmentedContext,
 ) -> None:
+    rids: list[str] = []
+    field_ids: list[str] = []
     text_block_ids: list[TextBlockId] = []
     for text_block_id in context.text_block_ids():
         try:
-            text_block_ids.append(parse_text_block_id(text_block_id))
+            tb_id = parse_text_block_id(text_block_id)
         except ValueError:  # pragma: no cover
             # Some text block ids are not paragraphs nor fields, so they are skipped
             # (e.g. USER_CONTEXT_0, when the user provides extra context)
             continue
+        field_id = tb_id if isinstance(tb_id, FieldId) else tb_id.field_id
+        text_block_ids.append(tb_id)
+        field_ids.append(field_id.full())
+        rids.append(tb_id.rid)
     if len(text_block_ids) == 0:  # pragma: no cover
         return
+    resource_origin = False
+    resource_extra = False
+    classification_labels = False
+    field_entities = False
     ops = 0
     if MetadataExtensionType.ORIGIN in strategy.types:
         ops += 1
-        await extend_prompt_context_with_origin_metadata(
-            context, kbid, text_block_ids, augmented_context
-        )
+        resource_origin = True
     if MetadataExtensionType.CLASSIFICATION_LABELS in strategy.types:
         ops += 1
-        await extend_prompt_context_with_classification_labels(
-            context, kbid, text_block_ids, augmented_context
-        )
+        classification_labels = True
     if MetadataExtensionType.NERS in strategy.types:
         ops += 1
-        await extend_prompt_context_with_ner(context, kbid, text_block_ids, augmented_context)
+        field_entities = True
     if MetadataExtensionType.EXTRA_METADATA in strategy.types:
         ops += 1
-        await extend_prompt_context_with_extra_metadata(context, kbid, text_block_ids, augmented_context)
+        resource_extra = True
     metrics.set("metadata_extension_ops", ops * len(text_block_ids))
+    augment_req = AugmentRequest()
+    if resource_origin or resource_extra or classification_labels:
+        augment_req.resources = [
+            AugmentResources(
+                given=rids,
+                origin=resource_origin,
+                extra=resource_extra,
+                classification_labels=classification_labels,
+            )
+        ]
+    if classification_labels or field_entities:
+        augment_req.fields = [
+            AugmentFields(
+                given=field_ids,
+                classification_labels=classification_labels,
+                entities=field_entities,
+            )
+        ]
-def parse_text_block_id(text_block_id: str) -> TextBlockId:
-    try:
-        # Typically, the text block id is a paragraph id
-        return ParagraphId.from_string(text_block_id)
-    except ValueError:
-        # When we're doing `full_resource` or `hierarchy` strategies, the text block id
-        # is a field id
-        return FieldId.from_string(text_block_id)
+    if augment_req.resources is None and augment_req.fields is None:
+        # nothing to augment
+        return
+    augmented = await rpc.augment(kbid, augment_req)
-async def extend_prompt_context_with_origin_metadata(
-    context: CappedPromptContext,
-    kbid,
-    text_block_ids: list[TextBlockId],
-    augmented_context: AugmentedContext,
-):
-    async def _get_origin(kbid: str, rid: str) -> tuple[str, Optional[Origin]]:
-        origin = None
-        resource = await cache.get_resource(kbid, rid)
-        if resource is not None:
-            pb_origin = await resource.get_origin()
-            if pb_origin is not None:
-                origin = from_proto.origin(pb_origin)
-        return rid, origin
-    rids = {tb_id.rid for tb_id in text_block_ids}
-    origins = await run_concurrently([_get_origin(kbid, rid) for rid in rids])
-    rid_to_origin = {rid: origin for rid, origin in origins if origin is not None}
     for tb_id in text_block_ids:
-        origin = rid_to_origin.get(tb_id.rid)
-        if origin is not None and tb_id.full() in context:
-            text = context.output.pop(tb_id.full())
-            extended_text = text + f"\n\nDOCUMENT METADATA AT ORIGIN:\n{to_yaml(origin)}"
-            context[tb_id.full()] = extended_text
-            augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
-                id=tb_id.full(),
-                text=extended_text,
-                parent=tb_id.full(),
-                augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
-            )
+        field_id = tb_id if isinstance(tb_id, FieldId) else tb_id.field_id
+        resource = augmented.resources.get(tb_id.rid)
+        field = augmented.fields.get(field_id.full())
-async def extend_prompt_context_with_classification_labels(
-    context: CappedPromptContext,
-    kbid: str,
-    text_block_ids: list[TextBlockId],
-    augmented_context: AugmentedContext,
-):
-    async def _get_labels(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, list[tuple[str, str]]]:
-        fid = _id if isinstance(_id, FieldId) else _id.field_id
-        labels = set()
-        resource = await cache.get_resource(kbid, fid.rid)
         if resource is not None:
-            pb_basic = await resource.get_basic()
-            if pb_basic is not None:
-                # Add the classification labels of the resource
-                for classif in pb_basic.usermetadata.classifications:
-                    labels.add((classif.labelset, classif.label))
-                # Add the classifications labels of the field
-                for fc in pb_basic.computedmetadata.field_classifications:
-                    if fc.field.field == fid.key and fc.field.field_type == fid.pb_type:
-                        for classif in fc.classifications:
-                            if classif.cancelled_by_user:  # pragma: no cover
-                                continue
-                            labels.add((classif.labelset, classif.label))
-        return _id, list(labels)
-    classif_labels = await run_concurrently([_get_labels(kbid, tb_id) for tb_id in text_block_ids])
-    tb_id_to_labels = {tb_id: labels for tb_id, labels in classif_labels if len(labels) > 0}
-    for tb_id in text_block_ids:
-        labels = tb_id_to_labels.get(tb_id)
-        if labels is not None and tb_id.full() in context:
-            text = context.output.pop(tb_id.full())
-            labels_text = "DOCUMENT CLASSIFICATION LABELS:"
-            for labelset, label in labels:
-                labels_text += f"\n - {label} ({labelset})"
-            extended_text = text + "\n\n" + labels_text
-            context[tb_id.full()] = extended_text
-            augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
-                id=tb_id.full(),
-                text=extended_text,
-                parent=tb_id.full(),
-                augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
-            )
+            if resource.origin is not None:
+                text = context.output.pop(tb_id.full())
+                extended_text = text + f"\n\nDOCUMENT METADATA AT ORIGIN:\n{to_yaml(resource.origin)}"
+                context[tb_id.full()] = extended_text
+                augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
+                    id=tb_id.full(),
+                    text=extended_text,
+                    parent=tb_id.full(),
+                    augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
+                )
+            if resource.extra is not None:
+                text = context.output.pop(tb_id.full())
+                extended_text = text + f"\n\nDOCUMENT EXTRA METADATA:\n{to_yaml(resource.extra)}"
+                context[tb_id.full()] = extended_text
+                augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
+                    id=tb_id.full(),
+                    text=extended_text,
+                    parent=tb_id.full(),
+                    augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
+                )
-async def extend_prompt_context_with_ner(
-    context: CappedPromptContext,
-    kbid: str,
-    text_block_ids: list[TextBlockId],
-    augmented_context: AugmentedContext,
-):
-    async def _get_ners(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, dict[str, set[str]]]:
-        fid = _id if isinstance(_id, FieldId) else _id.field_id
-        ners: dict[str, set[str]] = {}
-        resource = await cache.get_resource(kbid, fid.rid)
-        if resource is not None:
-            field = await resource.get_field(fid.key, fid.pb_type, load=False)
-            fcm = await field.get_field_metadata()
-            if fcm is not None:
-                # Data Augmentation + Processor entities
-                for (
-                    data_aumgentation_task_id,
-                    entities_wrapper,
-                ) in fcm.metadata.entities.items():
-                    for entity in entities_wrapper.entities:
-                        ners.setdefault(entity.label, set()).add(entity.text)
-                # Legacy processor entities
-                # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
-                for token, family in fcm.metadata.ner.items():
-                    ners.setdefault(family, set()).add(token)
-        return _id, ners
-    nerss = await run_concurrently([_get_ners(kbid, tb_id) for tb_id in text_block_ids])
-    tb_id_to_ners = {tb_id: ners for tb_id, ners in nerss if len(ners) > 0}
-    for tb_id in text_block_ids:
-        ners = tb_id_to_ners.get(tb_id)
-        if ners is not None and tb_id.full() in context:
-            text = context.output.pop(tb_id.full())
-            ners_text = "DOCUMENT NAMED ENTITIES (NERs):"
-            for family, tokens in ners.items():
-                ners_text += f"\n - {family}:"
-                for token in sorted(list(tokens)):
-                    ners_text += f"\n   - {token}"
-            extended_text = text + "\n\n" + ners_text
-            context[tb_id.full()] = extended_text
-            augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
-                id=tb_id.full(),
-                text=extended_text,
-                parent=tb_id.full(),
-                augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
-            )
+        if tb_id.full() in context:
+            if (resource is not None and resource.classification_labels) or (
+                field is not None and field.classification_labels
+            ):
+                text = context.output.pop(tb_id.full())
+                labels_text = "DOCUMENT CLASSIFICATION LABELS:"
+                if resource is not None and resource.classification_labels:
+                    for labelset, labels in resource.classification_labels.items():
+                        for label in labels:
+                            labels_text += f"\n - {label} ({labelset})"
+                if field is not None and field.classification_labels:
+                    for labelset, labels in field.classification_labels.items():
+                        for label in labels:
+                            labels_text += f"\n - {label} ({labelset})"
+                extended_text = text + "\n\n" + labels_text
+                context[tb_id.full()] = extended_text
+                augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
+                    id=tb_id.full(),
+                    text=extended_text,
+                    parent=tb_id.full(),
+                    augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
+                )
+            if field is not None and field.entities:
+                ners = field.entities
-async def extend_prompt_context_with_extra_metadata(
-    context: CappedPromptContext,
-    kbid: str,
-    text_block_ids: list[TextBlockId],
-    augmented_context: AugmentedContext,
-):
-    async def _get_extra(kbid: str, rid: str) -> tuple[str, Optional[Extra]]:
-        extra = None
-        resource = await cache.get_resource(kbid, rid)
-        if resource is not None:
-            pb_extra = await resource.get_extra()
-            if pb_extra is not None:
-                extra = from_proto.extra(pb_extra)
-        return rid, extra
-    rids = {tb_id.rid for tb_id in text_block_ids}
-    extras = await run_concurrently([_get_extra(kbid, rid) for rid in rids])
-    rid_to_extra = {rid: extra for rid, extra in extras if extra is not None}
-    for tb_id in text_block_ids:
-        extra = rid_to_extra.get(tb_id.rid)
-        if extra is not None and tb_id.full() in context:
-            text = context.output.pop(tb_id.full())
-            extended_text = text + f"\n\nDOCUMENT EXTRA METADATA:\n{to_yaml(extra)}"
-            context[tb_id.full()] = extended_text
-            augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
-                id=tb_id.full(),
-                text=extended_text,
-                parent=tb_id.full(),
-                augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
-            )
+                text = context.output.pop(tb_id.full())
+                ners_text = "DOCUMENT NAMED ENTITIES (NERs):"
+                for family, tokens in ners.items():
+                    ners_text += f"\n - {family}:"
+                    for token in sorted(list(tokens)):
+                        ners_text += f"\n   - {token}"
+                extended_text = text + "\n\n" + ners_text
+                context[tb_id.full()] = extended_text
+                augmented_context.paragraphs[tb_id.full()] = AugmentedTextBlock(
+                    id=tb_id.full(),
+                    text=extended_text,
+                    parent=tb_id.full(),
+                    augmentation_type=TextBlockAugmentationType.METADATA_EXTENSION,
+                )
+def parse_text_block_id(text_block_id: str) -> TextBlockId:
+    try:
+        # Typically, the text block id is a paragraph id
+        return ParagraphId.from_string(text_block_id)
+    except ValueError:
+        # When we're doing `full_resource` or `hierarchy` strategies, the text block id
+        # is a field id
+        return FieldId.from_string(text_block_id)
 def to_yaml(obj: BaseModel) -> str:
+    # FIXME: this dumps enums REALLY poorly, e.g.,
+    # `!!python/object/apply:nucliadb_models.metadata.Source\n- WEB` for
+    # Source.WEB instead of `WEB`
     return yaml.dump(
         obj.model_dump(exclude_none=True, exclude_defaults=True, exclude_unset=True),
         default_flow_style=False,
@@ -589,37 +508,74 @@ async def field_extension_prompt_context(
         if resource_uuid not in ordered_resources:
             ordered_resources.append(resource_uuid)
-    # Fetch the extracted texts of the specified fields for each resource
-    extend_fields = strategy.fields
-    extend_field_ids = []
-    for resource_uuid in ordered_resources:
-        for field_id in extend_fields:
-            try:
-                fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
-                extend_field_ids.append(fid)
-            except ValueError:  # pragma: no cover
-                # Invalid field id, skipping
-                continue
+    resource_title = False
+    resource_summary = False
+    filters: list[nucliadb_models.filters.Field | nucliadb_models.filters.Generated] = []
+    # this strategy exposes a way to access the resource title and summary using
+    # a field id. However, as they are resource properties, we must request them
+    # as such
+    for name in strategy.fields:
+        if name == "a/title":
+            resource_title = True
+        elif name == "a/summary":
+            resource_summary = True
+        else:
+            # model already enforces type/name format
+            field_type, field_name = name.split("/")
+            filters.append(
+                nucliadb_models.filters.Field(
+                    type=FIELD_TYPE_STR_TO_NAME[field_type], name=field_name or None
+                )
+            )
-    tasks = [hydrate_field_text(kbid, fid) for fid in extend_field_ids]
-    field_extracted_texts = await run_concurrently(tasks)
+    for da_prefix in strategy.data_augmentation_field_prefixes:
+        filters.append(nucliadb_models.filters.Generated(by="data-augmentation", da_task=da_prefix))
+    augmented = await rpc.augment(
+        kbid,
+        AugmentRequest(
+            resources=[
+                AugmentResources(
+                    given=ordered_resources,
+                    title=resource_title,
+                    summary=resource_summary,
+                    fields=AugmentResourceFields(
+                        text=True,
+                        filters=filters,
+                    ),
+                )
+            ]
+        ),
+    )
+    # REVIEW(decoupled-ask): we don't have the field count anymore, is this good enough?
+    metrics.set("field_extension_ops", len(ordered_resources))
-    metrics.set("field_extension_ops", len(field_extracted_texts))
+    extracted_texts = {}
+    # now we need to expose title and summary as fields again, so the result
+    # stays consistent with the view we are providing in the API
+    for rid, augmented_resource in augmented.resources.items():
+        if augmented_resource.title:
+            extracted_texts[f"{rid}/a/title"] = augmented_resource.title
+        if augmented_resource.summary:
+            extracted_texts[f"{rid}/a/summary"] = augmented_resource.summary
-    for result in field_extracted_texts:
-        if result is None:  # pragma: no cover
+    for fid, augmented_field in augmented.fields.items():
+        if augmented_field is None or augmented_field.text is None:  # pragma: no cover
             continue
-        field, extracted_text = result
+        extracted_texts[fid] = augmented_field.text
+    for fid, extracted_text in extracted_texts.items():
         # First off, remove the text block ids from paragraphs that belong to
         # the same field, as otherwise the context will be duplicated.
         for tb_id in context.text_block_ids():
-            if tb_id.startswith(field.full()):
+            if tb_id.startswith(fid):
                 del context[tb_id]
         # Add the extracted text of each field to the beginning of the context.
-        if field.full() not in context:
-            context[field.full()] = extracted_text
-            augmented_context.fields[field.full()] = AugmentedTextBlock(
-                id=field.full(),
+        if fid not in context:
+            context[fid] = extracted_text
+            augmented_context.fields[fid] = AugmentedTextBlock(
+                id=fid,
                 text=extracted_text,
                 augmentation_type=TextBlockAugmentationType.FIELD_EXTENSION,
             )
@@ -630,13 +586,6 @@ async def field_extension_prompt_context(
             context[paragraph.id] = _clean_paragraph_text(paragraph)
-async def get_orm_field(kbid: str, field_id: FieldId) -> Optional[Field]:
-    resource = await cache.get_resource(kbid, field_id.rid)
-    if resource is None:  # pragma: no cover
-        return None
-    return await resource.get_field(key=field_id.key, type=field_id.pb_type, load=False)
 async def neighbouring_paragraphs_prompt_context(
     context: CappedPromptContext,
     kbid: str,
@@ -652,83 +601,52 @@ async def neighbouring_paragraphs_prompt_context(
     retrieved_paragraphs_ids = [
         ParagraphId.from_string(text_block.id) for text_block in ordered_text_blocks
     ]
-    unique_field_ids = list({pid.field_id for pid in retrieved_paragraphs_ids})
-    # Get extracted texts and metadatas for all fields
-    fm_ops = []
-    et_ops = []
-    for field_id in unique_field_ids:
-        field = await get_orm_field(kbid, field_id)
-        if field is None:
-            continue
-        fm_ops.append(asyncio.create_task(field.get_field_metadata()))
-        et_ops.append(asyncio.create_task(field.get_extracted_text()))
-    field_metadatas: dict[FieldId, FieldComputedMetadata] = {
-        fid: fm for fid, fm in zip(unique_field_ids, await asyncio.gather(*fm_ops)) if fm is not None
-    }
-    extracted_texts: dict[FieldId, ExtractedText] = {
-        fid: et for fid, et in zip(unique_field_ids, await asyncio.gather(*et_ops)) if et is not None
-    }
-    def _get_paragraph_text(extracted_text: ExtractedText, pid: ParagraphId) -> str:
-        if pid.field_id.subfield_id:
-            text = extracted_text.split_text.get(pid.field_id.subfield_id) or ""
-        else:
-            text = extracted_text.text
-        return text[pid.paragraph_start : pid.paragraph_end]
+    augmented = await rpc.augment(
+        kbid,
+        AugmentRequest(
+            paragraphs=[
+                AugmentParagraphs(
+                    given=[AugmentParagraph(id=pid.full()) for pid in retrieved_paragraphs_ids],
+                    text=True,
+                    neighbours_before=strategy.before,
+                    neighbours_after=strategy.after,
+                )
+            ]
+        ),
+    )
     for pid in retrieved_paragraphs_ids:
-        # Add the retrieved paragraph first
-        field_extracted_text = extracted_texts.get(pid.field_id, None)
-        if field_extracted_text is None:
+        paragraph = augmented.paragraphs.get(pid.full())
+        if paragraph is None:
             continue
-        ptext = _get_paragraph_text(field_extracted_text, pid)
+        ptext = paragraph.text or ""
         if ptext and pid.full() not in context:
             context[pid.full()] = ptext
         # Now add the neighbouring paragraphs
-        field_extracted_metadata = field_metadatas.get(pid.field_id, None)
-        if field_extracted_metadata is None:
-            continue
-        field_pids = [
-            ParagraphId(
-                field_id=pid.field_id,
-                paragraph_start=p.start,
-                paragraph_end=p.end,
-            )
-            for p in field_extracted_metadata.metadata.paragraphs
+        neighbour_ids = [
+            *(paragraph.neighbours_before or []),
+            *(paragraph.neighbours_after or []),
         ]
-        try:
-            index = field_pids.index(pid)
-        except ValueError:
-            continue
+        for npid in neighbour_ids:
+            neighbour = augmented.paragraphs.get(npid)
+            assert neighbour is not None, "augment should never return dangling paragraph references"
-        for neighbour_index in get_neighbouring_indices(
-            index=index,
-            before=strategy.before,
-            after=strategy.after,
-            field_pids=field_pids,
-        ):
-            if neighbour_index == index:
-                # Already handled above
+            if ParagraphId.from_string(npid) in retrieved_paragraphs_ids or npid in context:
+                # already added
                 continue
-            try:
-                npid = field_pids[neighbour_index]
-            except IndexError:
-                continue
-            if npid in retrieved_paragraphs_ids or npid.full() in context:
-                # Already added
-                continue
-            ptext = _get_paragraph_text(field_extracted_text, npid)
-            if not ptext:
+            ntext = neighbour.text
+            if not ntext:
                 continue
-            context[npid.full()] = ptext
-            augmented_context.paragraphs[npid.full()] = AugmentedTextBlock(
-                id=npid.full(),
-                text=ptext,
-                position=get_text_position(npid, neighbour_index, field_extracted_metadata),
+            context[npid] = ntext
+            augmented_context.paragraphs[npid] = AugmentedTextBlock(
+                id=npid,
+                text=ntext,
+                position=neighbour.position,
                 parent=pid.full(),
                 augmentation_type=TextBlockAugmentationType.NEIGHBOURING_PARAGRAPHS,
             )
@@ -738,7 +656,7 @@ async def neighbouring_paragraphs_prompt_context(
 def get_text_position(
     paragraph_id: ParagraphId, index: int, field_metadata: FieldComputedMetadata
-) -> Optional[TextPosition]:
+) -> TextPosition | None:
     if paragraph_id.field_id.subfield_id:
         metadata = field_metadata.split_metadata[paragraph_id.field_id.subfield_id]
     else:
@@ -777,148 +695,144 @@ async def conversation_prompt_context(
     metrics: Metrics,
     augmented_context: AugmentedContext,
 ):
-    analyzed_fields: List[str] = []
+    analyzed_fields: list[str] = []
     ops = 0
-    async with get_driver().ro_transaction() as txn:
-        storage = await get_storage()
-        kb = KnowledgeBoxORM(txn, storage, kbid)
-        for paragraph in ordered_paragraphs:
-            if paragraph.id not in context:
-                context[paragraph.id] = _clean_paragraph_text(paragraph)
-            # If the paragraph is a conversation and it matches semantically, we assume we
-            # have matched with the question, therefore try to include the answer to the
-            # context by pulling the next few messages of the conversation field
-            rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
-            if field_type == "c" and paragraph.score_type in (
-                SCORE_TYPE.VECTOR,
-                SCORE_TYPE.BOTH,
-                SCORE_TYPE.BM25,
-            ):
-                field_unique_id = "-".join([rid, field_type, field_id])
-                if field_unique_id in analyzed_fields:
+    conversation_paragraphs = []
+    for paragraph in ordered_paragraphs:
+        if paragraph.id not in context:
+            context[paragraph.id] = _clean_paragraph_text(paragraph)
+        parent_paragraph_id = ParagraphId.from_string(paragraph.id)
+        if parent_paragraph_id.field_id.type != FieldTypeName.CONVERSATION.abbreviation():
+            # conversational strategy only applies to conversation fields
+            continue
+        field_unique_id = parent_paragraph_id.field_id.full_without_subfield()
+        if field_unique_id in analyzed_fields:
+            continue
+        conversation_paragraphs.append((parent_paragraph_id, paragraph))
+    # augment conversation paragraphs
+    if strategy.full:
+        full_conversation = True
+        max_conversation_messages = None
+    else:
+        full_conversation = False
+        max_conversation_messages = strategy.max_messages
+    augment = AugmentRequest(
+        fields=[
+            AugmentFields(
+                given=[paragraph_id.field_id.full() for paragraph_id, _ in conversation_paragraphs],
+                full_conversation=full_conversation,
+                max_conversation_messages=max_conversation_messages,
+                conversation_text_attachments=strategy.attachments_text,
+                conversation_image_attachments=strategy.attachments_images,
+            )
+        ]
+    )
+    augmented = await rpc.augment(kbid, augment)
+    attachments: dict[ParagraphId, list[FieldId]] = {}
+    for parent_paragraph_id, paragraph in conversation_paragraphs:
+        fid = parent_paragraph_id.field_id
+        field = augmented.fields.get(fid.full_without_subfield())
+        if field is not None:
+            field = cast(AugmentedConversationField, field)
+            for _message in field.messages or []:
+                ops += 1
+                if not _message.text:
+                    continue
+                text = _message.text
+                pid = ParagraphId(
+                    field_id=FieldId(
+                        rid=fid.rid,
+                        type=fid.type,
+                        key=fid.key,
+                        subfield_id=_message.ident,
+                    ),
+                    paragraph_start=0,
+                    paragraph_end=len(text),
+                ).full()
+                if pid in context:
                     continue
-                resource = await kb.get(rid)
-                if resource is None:  # pragma: no cover
+                context[pid] = text
+                attachments.setdefault(parent_paragraph_id, []).extend(
+                    [FieldId.from_string(attachment_id) for attachment_id in field.attachments or []]
+                )
+                augmented_context.paragraphs[pid] = AugmentedTextBlock(
+                    id=pid,
+                    text=text,
+                    parent=paragraph.id,
+                    augmentation_type=TextBlockAugmentationType.CONVERSATION,
+                )
+    # augment attachments
+    if strategy.attachments_text or (
+        (strategy.attachments_images and visual_llm) and len(attachments) > 0
+    ):
+        augment = AugmentRequest(
+            fields=[
+                AugmentFields(
+                    given=[
+                        id.full()
+                        for paragraph_attachments in attachments.values()
+                        for id in paragraph_attachments
+                    ],
+                    text=strategy.attachments_text,
+                    file_thumbnail=(strategy.attachments_images and visual_llm),
+                )
+            ]
+        )
+        augmented = await rpc.augment(kbid, augment)
+        for parent_paragraph_id, paragraph_attachments in attachments.items():
+            for attachment_id in paragraph_attachments:
+                attachment_field = augmented.fields.get(attachment_id.full())
+                if attachment_field is None:
                     continue
-                field_obj: Conversation = await resource.get_field(
-                    field_id, FIELD_TYPE_STR_TO_PB["c"], load=True
-                )  # type: ignore
-                cmetadata = await field_obj.get_metadata()
-                attachments: List[resources_pb2.FieldRef] = []
-                if strategy.full:
-                    ops += 5
-                    extracted_text = await field_obj.get_extracted_text()
-                    for current_page in range(1, cmetadata.pages + 1):
-                        conv = await field_obj.db_get_value(current_page)
-                        for message in conv.messages:
-                            ident = message.ident
-                            if extracted_text is not None:
-                                text = extracted_text.split_text.get(ident, message.content.text.strip())
-                            else:
-                                text = message.content.text.strip()
-                            pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
-                            attachments.extend(message.content.attachments_fields)
-                            if pid in context:
-                                continue
-                            context[pid] = text
-                            augmented_context.paragraphs[pid] = AugmentedTextBlock(
-                                id=pid,
-                                text=text,
-                                parent=paragraph.id,
-                                augmentation_type=TextBlockAugmentationType.CONVERSATION,
-                            )
-                else:
-                    # Add first message
-                    extracted_text = await field_obj.get_extracted_text()
-                    first_page = await field_obj.db_get_value()
-                    if len(first_page.messages) > 0:
-                        message = first_page.messages[0]
-                        ident = message.ident
-                        if extracted_text is not None:
-                            text = extracted_text.split_text.get(ident, message.content.text.strip())
-                        else:
-                            text = message.content.text.strip()
-                        attachments.extend(message.content.attachments_fields)
-                        pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
-                        if pid in context:
-                            continue
-                        context[pid] = text
-                        augmented_context.paragraphs[pid] = AugmentedTextBlock(
-                            id=pid,
-                            text=text,
-                            parent=paragraph.id,
-                            augmentation_type=TextBlockAugmentationType.CONVERSATION,
-                        )
+                if strategy.attachments_text and attachment_field.text:
+                    ops += 1
-                    messages: Deque[resources_pb2.Message] = deque(maxlen=strategy.max_messages)
-                    pending = -1
-                    for page in range(1, cmetadata.pages + 1):
-                        # Collect the messages with the window asked by the user arround the match paragraph
-                        conv = await field_obj.db_get_value(page)
-                        for message in conv.messages:
-                            messages.append(message)
-                            if pending > 0:
-                                pending -= 1
-                            if message.ident == mident:
-                                pending = (strategy.max_messages - 1) // 2
-                            if pending == 0:
-                                break
-                        if pending == 0:
-                            break
-                    for message in messages:
-                        ops += 1
-                        text = message.content.text.strip()
-                        attachments.extend(message.content.attachments_fields)
-                        pid = f"{rid}/{field_type}/{field_id}/{message.ident}/0-{len(message.content.text) + 1}"
-                        if pid in context:
-                            continue
+                    pid = f"{attachment_id.full_without_subfield()}/0-{len(attachment_field.text)}"
+                    if pid not in context:
+                        text = f"Attachment {attachment_id.key}: {attachment_field.text}\n\n"
                         context[pid] = text
                         augmented_context.paragraphs[pid] = AugmentedTextBlock(
                             id=pid,
                             text=text,
-                            parent=paragraph.id,
+                            parent=parent_paragraph_id.full(),
                             augmentation_type=TextBlockAugmentationType.CONVERSATION,
                         )
-                if strategy.attachments_text:
-                    # add on the context the images if vlm enabled
-                    for attachment in attachments:
-                        ops += 1
-                        field: File = await resource.get_field(
-                            attachment.field_id, attachment.field_type, load=True
-                        )  # type: ignore
-                        extracted_text = await field.get_extracted_text()
-                        if extracted_text is not None:
-                            pid = f"{rid}/{field_type}/{attachment.field_id}/0-{len(extracted_text.text) + 1}"
-                            if pid in context:
-                                continue
-                            text = f"Attachment {attachment.field_id}: {extracted_text.text}\n\n"
-                            context[pid] = text
-                            augmented_context.paragraphs[pid] = AugmentedTextBlock(
-                                id=pid,
-                                text=text,
-                                parent=paragraph.id,
-                                augmentation_type=TextBlockAugmentationType.CONVERSATION,
-                            )
-                if strategy.attachments_images and visual_llm:
-                    for attachment in attachments:
-                        ops += 1
-                        file_field: File = await resource.get_field(
-                            attachment.field_id, attachment.field_type, load=True
-                        )  # type: ignore
-                        image = await get_file_thumbnail_image(file_field)
-                        if image is not None:
-                            pid = f"{rid}/f/{attachment.field_id}/0-0"
-                            context.images[pid] = image
-                analyzed_fields.append(field_unique_id)
+                if (
+                    (strategy.attachments_images and visual_llm)
+                    and isinstance(attachment_field, AugmentedFileField)
+                    and attachment_field.thumbnail_image
+                ):
+                    ops += 1
+                    image = await rpc.download_image(
+                        kbid,
+                        attachment_id,
+                        attachment_field.thumbnail_image,
+                        # We assume the thumbnail is always generated as JPEG by Nuclia processing
+                        mime_type="image/jpeg",
+                    )
+                    if image is not None:
+                        pid = f"{attachment_id.rid}/f/{attachment_id.key}/0-0"
+                        context.images[pid] = image
+        analyzed_fields.append(field_unique_id)
     metrics.set("conversation_ops", ops)
@@ -939,66 +853,93 @@ async def hierarchy_prompt_context(
     # Make a copy of the ordered paragraphs to avoid modifying the original list, which is returned
     # in the response to the user
     ordered_paragraphs_copy = copy.deepcopy(ordered_paragraphs)
-    resources: Dict[str, ExtraCharsParagraph] = {}
+    resources: dict[str, ExtraCharsParagraph] = {}
     # Iterate paragraphs to get extended text
+    paragraphs_to_augment = []
     for paragraph in ordered_paragraphs_copy:
         paragraph_id = ParagraphId.from_string(paragraph.id)
-        extended_paragraph_text = paragraph.text
-        if paragraphs_extra_characters > 0:
-            extended_paragraph_text = await get_paragraph_text(
-                kbid=kbid,
-                paragraph_id=paragraph_id,
-                log_on_missing_field=True,
-            )
         rid = paragraph_id.rid
+        if paragraphs_extra_characters > 0:
+            paragraph_id.paragraph_end += paragraphs_extra_characters
+        paragraphs_to_augment.append(paragraph_id)
         if rid not in resources:
             # Get the title and the summary of the resource
-            title_text = await get_paragraph_text(
-                kbid=kbid,
-                paragraph_id=ParagraphId(
-                    field_id=FieldId(
-                        rid=rid,
-                        type="a",
-                        key="title",
-                    ),
-                    paragraph_start=0,
-                    paragraph_end=500,
+            title_paragraph_id = ParagraphId(
+                field_id=FieldId(
+                    rid=rid,
+                    type="a",
+                    key="title",
                 ),
-                log_on_missing_field=False,
+                paragraph_start=0,
+                paragraph_end=500,
             )
-            summary_text = await get_paragraph_text(
-                kbid=kbid,
-                paragraph_id=ParagraphId(
-                    field_id=FieldId(
-                        rid=rid,
-                        type="a",
-                        key="summary",
-                    ),
-                    paragraph_start=0,
-                    paragraph_end=1000,
+            summary_paragraph_id = ParagraphId(
+                field_id=FieldId(
+                    rid=rid,
+                    type="a",
+                    key="summary",
                 ),
-                log_on_missing_field=False,
+                paragraph_start=0,
+                paragraph_end=1000,
             )
+            paragraphs_to_augment.append(title_paragraph_id)
+            paragraphs_to_augment.append(summary_paragraph_id)
             resources[rid] = ExtraCharsParagraph(
-                title=title_text,
-                summary=summary_text,
-                paragraphs=[(paragraph, extended_paragraph_text)],
+                title=title_paragraph_id,
+                summary=summary_paragraph_id,
+                paragraphs=[(paragraph, paragraph_id)],
             )
         else:
-            resources[rid].paragraphs.append((paragraph, extended_paragraph_text))
+            resources[rid].paragraphs.append((paragraph, paragraph_id))
     metrics.set("hierarchy_ops", len(resources))
+    augmented = await rpc.augment(
+        kbid,
+        AugmentRequest(
+            paragraphs=[
+                AugmentParagraphs(
+                    given=[
+                        AugmentParagraph(id=paragraph_id.full())
+                        for paragraph_id in paragraphs_to_augment
+                    ],
+                    text=True,
+                )
+            ]
+        ),
+    )
     augmented_paragraphs = set()
     # Modify the first paragraph of each resource to include the title and summary of the resource, as well as the
     # extended paragraph text of all the paragraphs in the resource.
     for values in resources.values():
-        title_text = values.title
-        summary_text = values.summary
+        augmented_title = augmented.paragraphs.get(values.title.full())
+        if augmented_title:
+            title_text = augmented_title.text or ""
+        else:
+            title_text = ""
+        augmented_summary = augmented.paragraphs.get(values.summary.full())
+        if augmented_summary:
+            summary_text = augmented_summary.text or ""
+        else:
+            summary_text = ""
         first_paragraph = None
         text_with_hierarchy = ""
-        for paragraph, extended_paragraph_text in values.paragraphs:
+        for paragraph, paragraph_id in values.paragraphs:
+            augmented_paragraph = augmented.paragraphs.get(paragraph_id.full())
+            if augmented_paragraph:
+                extended_paragraph_text = augmented_paragraph.text or ""
+            else:
+                extended_paragraph_text = ""
             if first_paragraph is None:
                 first_paragraph = paragraph
             text_with_hierarchy += "\n EXTRACTED BLOCK: \n " + extended_paragraph_text + " \n\n "
@@ -1035,14 +976,14 @@ class PromptContextBuilder:
         self,
         kbid: str,
         ordered_paragraphs: list[FindParagraph],
-        resource: Optional[str] = None,
-        user_context: Optional[list[str]] = None,
-        user_image_context: Optional[list[Image]] = None,
-        strategies: Optional[Sequence[RagStrategy]] = None,
-        image_strategies: Optional[Sequence[ImageRagStrategy]] = None,
-        max_context_characters: Optional[int] = None,
+        resource: str | None = None,
+        user_context: list[str] | None = None,
+        user_image_context: list[Image] | None = None,
+        strategies: Sequence[RagStrategy] | None = None,
+        image_strategies: Sequence[ImageRagStrategy] | None = None,
+        max_context_characters: int | None = None,
         visual_llm: bool = False,
-        query_image: Optional[Image] = None,
+        query_image: Image | None = None,
         metrics: Metrics = Metrics("prompt_context_builder"),
     ):
         self.kbid = kbid
@@ -1088,10 +1029,10 @@ class PromptContextBuilder:
         if self.image_strategies is None or len(self.image_strategies) == 0:
             # Nothing to do
             return
-        page_image_strategy: Optional[PageImageStrategy] = None
+        page_image_strategy: PageImageStrategy | None = None
         max_page_images = 5
-        table_image_strategy: Optional[TableImageStrategy] = None
-        paragraph_image_strategy: Optional[ParagraphImageStrategy] = None
+        table_image_strategy: TableImageStrategy | None = None
+        paragraph_image_strategy: ParagraphImageStrategy | None = None
         for strategy in self.image_strategies:
             if strategy.name == ImageRagStrategyName.PAGE_IMAGE:
                 if page_image_strategy is None:
@@ -1121,7 +1062,12 @@ class PromptContextBuilder:
                 # page_image_id: rid/f/myfield/0
                 page_image_id = "/".join([pid.field_id.full(), str(paragraph_page_number)])
                 if page_image_id not in context.images:
-                    image = await get_page_image(self.kbid, pid, paragraph_page_number)
+                    image = await rpc.download_image(
+                        self.kbid,
+                        pid.field_id,
+                        f"generated/extracted_images_{paragraph_page_number}.png",
+                        mime_type="image/png",
+                    )
                     if image is not None:
                         ops += 1
                         context.images[page_image_id] = image
@@ -1141,7 +1087,9 @@ class PromptContextBuilder:
             if (add_table or add_paragraph) and (
                 paragraph.reference is not None and paragraph.reference != ""
             ):
-                pimage = await get_paragraph_image(self.kbid, pid, paragraph.reference)
+                pimage = await rpc.download_image(
+                    self.kbid, pid.field_id, f"generated/{paragraph.reference}", mime_type="image/png"
+                )
                 if pimage is not None:
                     ops += 1
                     context.images[paragraph.id] = pimage
@@ -1171,12 +1119,12 @@ class PromptContextBuilder:
             RagStrategyName.GRAPH,
         ]
-        full_resource: Optional[FullResourceStrategy] = None
-        hierarchy: Optional[HierarchyResourceStrategy] = None
-        neighbouring_paragraphs: Optional[NeighbouringParagraphsStrategy] = None
-        field_extension: Optional[FieldExtensionStrategy] = None
-        metadata_extension: Optional[MetadataExtensionStrategy] = None
-        conversational_strategy: Optional[ConversationalStrategy] = None
+        full_resource: FullResourceStrategy | None = None
+        hierarchy: HierarchyResourceStrategy | None = None
+        neighbouring_paragraphs: NeighbouringParagraphsStrategy | None = None
+        field_extension: FieldExtensionStrategy | None = None
+        metadata_extension: MetadataExtensionStrategy | None = None
+        conversational_strategy: ConversationalStrategy | None = None
         for strategy in self.strategies:
             if strategy.name == RagStrategyName.FIELD_EXTENSION:
                 field_extension = cast(FieldExtensionStrategy, strategy)
@@ -1269,7 +1217,7 @@ class PromptContextBuilder:
             )
-def get_paragraph_page_number(paragraph: FindParagraph) -> Optional[int]:
+def get_paragraph_page_number(paragraph: FindParagraph) -> int | None:
     if not paragraph.page_with_visual:
         return None
     if paragraph.position is None:
@@ -1279,9 +1227,9 @@ def get_paragraph_page_number(paragraph: FindParagraph) -> Optional[int]:
 @dataclass
 class ExtraCharsParagraph:
-    title: str
-    summary: str
-    paragraphs: List[Tuple[FindParagraph, str]]
+    title: ParagraphId
+    summary: ParagraphId
+    paragraphs: list[tuple[FindParagraph, ParagraphId]]
 def _clean_paragraph_text(paragraph: FindParagraph) -> str:

nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

nucliadb 6.7.2.post4874py3-none-any.whl → 6.10.0.post5705py3-none-any.whl