PyPI - nucliadb - Versions diffs - 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl - Mend

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

migrations/0023_backfill_pg_catalog.py +2 -2
migrations/0029_backfill_field_status.py +3 -4
migrations/0032_remove_old_relations.py +2 -3
migrations/0038_backfill_catalog_field_labels.py +2 -2
migrations/0039_backfill_converation_splits_metadata.py +2 -2
migrations/0041_reindex_conversations.py +137 -0
migrations/pg/0010_shards_index.py +34 -0
nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
migrations/pg/0012_catalog_statistics_undo.py +26 -0
nucliadb/backups/create.py +2 -15
nucliadb/backups/restore.py +4 -15
nucliadb/backups/tasks.py +4 -1
nucliadb/common/back_pressure/cache.py +2 -3
nucliadb/common/back_pressure/materializer.py +7 -13
nucliadb/common/back_pressure/settings.py +6 -6
nucliadb/common/back_pressure/utils.py +1 -0
nucliadb/common/cache.py +9 -9
nucliadb/common/catalog/interface.py +12 -12
nucliadb/common/catalog/pg.py +41 -29
nucliadb/common/catalog/utils.py +3 -3
nucliadb/common/cluster/manager.py +5 -4
nucliadb/common/cluster/rebalance.py +483 -114
nucliadb/common/cluster/rollover.py +25 -9
nucliadb/common/cluster/settings.py +3 -8
nucliadb/common/cluster/utils.py +34 -8
nucliadb/common/context/__init__.py +7 -8
nucliadb/common/context/fastapi.py +1 -2
nucliadb/common/datamanagers/__init__.py +2 -4
nucliadb/common/datamanagers/atomic.py +4 -2
nucliadb/common/datamanagers/cluster.py +1 -2
nucliadb/common/datamanagers/fields.py +3 -4
nucliadb/common/datamanagers/kb.py +6 -6
nucliadb/common/datamanagers/labels.py +2 -3
nucliadb/common/datamanagers/resources.py +10 -33
nucliadb/common/datamanagers/rollover.py +5 -7
nucliadb/common/datamanagers/search_configurations.py +1 -2
nucliadb/common/datamanagers/synonyms.py +1 -2
nucliadb/common/datamanagers/utils.py +4 -4
nucliadb/common/datamanagers/vectorsets.py +4 -4
nucliadb/common/external_index_providers/base.py +32 -5
nucliadb/common/external_index_providers/manager.py +4 -5
nucliadb/common/filter_expression.py +128 -40
nucliadb/common/http_clients/processing.py +12 -23
nucliadb/common/ids.py +6 -4
nucliadb/common/locking.py +1 -2
nucliadb/common/maindb/driver.py +9 -8
nucliadb/common/maindb/local.py +5 -5
nucliadb/common/maindb/pg.py +9 -8
nucliadb/common/nidx.py +3 -4
nucliadb/export_import/datamanager.py +4 -3
nucliadb/export_import/exporter.py +11 -19
nucliadb/export_import/importer.py +13 -6
nucliadb/export_import/tasks.py +2 -0
nucliadb/export_import/utils.py +6 -18
nucliadb/health.py +2 -2
nucliadb/ingest/app.py +8 -8
nucliadb/ingest/consumer/consumer.py +8 -10
nucliadb/ingest/consumer/pull.py +3 -8
nucliadb/ingest/consumer/service.py +3 -3
nucliadb/ingest/consumer/utils.py +1 -1
nucliadb/ingest/fields/base.py +28 -49
nucliadb/ingest/fields/conversation.py +12 -12
nucliadb/ingest/fields/exceptions.py +1 -2
nucliadb/ingest/fields/file.py +22 -8
nucliadb/ingest/fields/link.py +7 -7
nucliadb/ingest/fields/text.py +2 -3
nucliadb/ingest/orm/brain_v2.py +78 -64
nucliadb/ingest/orm/broker_message.py +2 -4
nucliadb/ingest/orm/entities.py +10 -209
nucliadb/ingest/orm/index_message.py +4 -4
nucliadb/ingest/orm/knowledgebox.py +18 -27
nucliadb/ingest/orm/processor/auditing.py +1 -3
nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
nucliadb/ingest/orm/processor/processor.py +27 -27
nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
nucliadb/ingest/orm/resource.py +72 -70
nucliadb/ingest/orm/utils.py +1 -1
nucliadb/ingest/processing.py +17 -17
nucliadb/ingest/serialize.py +202 -145
nucliadb/ingest/service/writer.py +3 -109
nucliadb/ingest/settings.py +3 -4
nucliadb/ingest/utils.py +1 -2
nucliadb/learning_proxy.py +11 -11
nucliadb/metrics_exporter.py +5 -4
nucliadb/middleware/__init__.py +82 -1
nucliadb/migrator/datamanager.py +3 -4
nucliadb/migrator/migrator.py +1 -2
nucliadb/migrator/models.py +1 -2
nucliadb/migrator/settings.py +1 -2
nucliadb/models/internal/augment.py +614 -0
nucliadb/models/internal/processing.py +19 -19
nucliadb/openapi.py +2 -2
nucliadb/purge/__init__.py +3 -8
nucliadb/purge/orphan_shards.py +1 -2
nucliadb/reader/__init__.py +5 -0
nucliadb/reader/api/models.py +6 -13
nucliadb/reader/api/v1/download.py +59 -38
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/learning_config.py +24 -4
nucliadb/reader/api/v1/resource.py +61 -9
nucliadb/reader/api/v1/services.py +18 -14
nucliadb/reader/app.py +3 -1
nucliadb/reader/reader/notifications.py +1 -2
nucliadb/search/api/v1/__init__.py +2 -0
nucliadb/search/api/v1/ask.py +3 -4
nucliadb/search/api/v1/augment.py +585 -0
nucliadb/search/api/v1/catalog.py +11 -15
nucliadb/search/api/v1/find.py +16 -22
nucliadb/search/api/v1/hydrate.py +25 -25
nucliadb/search/api/v1/knowledgebox.py +1 -2
nucliadb/search/api/v1/predict_proxy.py +1 -2
nucliadb/search/api/v1/resource/ask.py +7 -7
nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
nucliadb/search/api/v1/resource/search.py +9 -11
nucliadb/search/api/v1/retrieve.py +130 -0
nucliadb/search/api/v1/search.py +28 -32
nucliadb/search/api/v1/suggest.py +11 -14
nucliadb/search/api/v1/summarize.py +1 -2
nucliadb/search/api/v1/utils.py +2 -2
nucliadb/search/app.py +3 -2
nucliadb/search/augmentor/__init__.py +21 -0
nucliadb/search/augmentor/augmentor.py +232 -0
nucliadb/search/augmentor/fields.py +704 -0
nucliadb/search/augmentor/metrics.py +24 -0
nucliadb/search/augmentor/paragraphs.py +334 -0
nucliadb/search/augmentor/resources.py +238 -0
nucliadb/search/augmentor/utils.py +33 -0
nucliadb/search/lifecycle.py +3 -1
nucliadb/search/predict.py +24 -17
nucliadb/search/predict_models.py +8 -9
nucliadb/search/requesters/utils.py +11 -10
nucliadb/search/search/cache.py +19 -23
nucliadb/search/search/chat/ask.py +88 -59
nucliadb/search/search/chat/exceptions.py +3 -5
nucliadb/search/search/chat/fetcher.py +201 -0
nucliadb/search/search/chat/images.py +6 -4
nucliadb/search/search/chat/old_prompt.py +1375 -0
nucliadb/search/search/chat/parser.py +510 -0
nucliadb/search/search/chat/prompt.py +563 -615
nucliadb/search/search/chat/query.py +449 -36
nucliadb/search/search/chat/rpc.py +85 -0
nucliadb/search/search/fetch.py +3 -4
nucliadb/search/search/filters.py +8 -11
nucliadb/search/search/find.py +33 -31
nucliadb/search/search/find_merge.py +124 -331
nucliadb/search/search/graph_strategy.py +14 -12
nucliadb/search/search/hydrator/__init__.py +3 -152
nucliadb/search/search/hydrator/fields.py +92 -50
nucliadb/search/search/hydrator/images.py +7 -7
nucliadb/search/search/hydrator/paragraphs.py +42 -26
nucliadb/search/search/hydrator/resources.py +20 -16
nucliadb/search/search/ingestion_agents.py +5 -5
nucliadb/search/search/merge.py +90 -94
nucliadb/search/search/metrics.py +10 -9
nucliadb/search/search/paragraphs.py +7 -9
nucliadb/search/search/predict_proxy.py +13 -9
nucliadb/search/search/query.py +14 -86
nucliadb/search/search/query_parser/fetcher.py +51 -82
nucliadb/search/search/query_parser/models.py +19 -20
nucliadb/search/search/query_parser/old_filters.py +20 -19
nucliadb/search/search/query_parser/parsers/ask.py +4 -5
nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
nucliadb/search/search/query_parser/parsers/common.py +5 -6
nucliadb/search/search/query_parser/parsers/find.py +6 -26
nucliadb/search/search/query_parser/parsers/graph.py +13 -23
nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
nucliadb/search/search/query_parser/parsers/search.py +15 -53
nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
nucliadb/search/search/rank_fusion.py +18 -13
nucliadb/search/search/rerankers.py +5 -6
nucliadb/search/search/retrieval.py +300 -0
nucliadb/search/search/summarize.py +5 -6
nucliadb/search/search/utils.py +3 -4
nucliadb/search/settings.py +1 -2
nucliadb/standalone/api_router.py +1 -1
nucliadb/standalone/app.py +4 -3
nucliadb/standalone/auth.py +5 -6
nucliadb/standalone/lifecycle.py +2 -2
nucliadb/standalone/run.py +2 -4
nucliadb/standalone/settings.py +5 -6
nucliadb/standalone/versions.py +3 -4
nucliadb/tasks/consumer.py +13 -8
nucliadb/tasks/models.py +2 -1
nucliadb/tasks/producer.py +3 -3
nucliadb/tasks/retries.py +8 -7
nucliadb/train/api/utils.py +1 -3
nucliadb/train/api/v1/shards.py +1 -2
nucliadb/train/api/v1/trainset.py +1 -2
nucliadb/train/app.py +1 -1
nucliadb/train/generator.py +4 -4
nucliadb/train/generators/field_classifier.py +2 -2
nucliadb/train/generators/field_streaming.py +6 -6
nucliadb/train/generators/image_classifier.py +2 -2
nucliadb/train/generators/paragraph_classifier.py +2 -2
nucliadb/train/generators/paragraph_streaming.py +2 -2
nucliadb/train/generators/question_answer_streaming.py +2 -2
nucliadb/train/generators/sentence_classifier.py +2 -2
nucliadb/train/generators/token_classifier.py +3 -2
nucliadb/train/generators/utils.py +6 -5
nucliadb/train/nodes.py +3 -3
nucliadb/train/resource.py +6 -8
nucliadb/train/settings.py +3 -4
nucliadb/train/types.py +11 -11
nucliadb/train/upload.py +3 -2
nucliadb/train/uploader.py +1 -2
nucliadb/train/utils.py +1 -2
nucliadb/writer/api/v1/export_import.py +4 -1
nucliadb/writer/api/v1/field.py +7 -11
nucliadb/writer/api/v1/knowledgebox.py +3 -4
nucliadb/writer/api/v1/resource.py +9 -20
nucliadb/writer/api/v1/services.py +10 -132
nucliadb/writer/api/v1/upload.py +73 -72
nucliadb/writer/app.py +8 -2
nucliadb/writer/resource/basic.py +12 -15
nucliadb/writer/resource/field.py +7 -5
nucliadb/writer/resource/origin.py +7 -0
nucliadb/writer/settings.py +2 -3
nucliadb/writer/tus/__init__.py +2 -3
nucliadb/writer/tus/azure.py +1 -3
nucliadb/writer/tus/dm.py +3 -3
nucliadb/writer/tus/exceptions.py +3 -4
nucliadb/writer/tus/gcs.py +5 -6
nucliadb/writer/tus/s3.py +2 -3
nucliadb/writer/tus/storage.py +3 -3
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
nucliadb/common/datamanagers/entities.py +0 -139
nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/search/search/hydrator/__init__.py CHANGED Viewed

@@ -17,33 +17,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-import asyncio
-import logging
-from contextlib import AsyncExitStack
-from typing import Optional
 from pydantic import BaseModel
-from nucliadb.common.external_index_providers.base import TextBlockMatch
-from nucliadb.common.ids import FieldId
-from nucliadb.common.maindb.utils import get_driver
-from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.search.search import cache
-from nucliadb.search.search.paragraphs import get_paragraph_text
 from nucliadb_models.common import FieldTypeName
-from nucliadb_models.resource import ExtractedDataTypeName, Resource
-from nucliadb_models.search import (
-    FindParagraph,
-    ResourceProperties,
-)
-from nucliadb_telemetry.metrics import Observer
-from nucliadb_utils import const
-from nucliadb_utils.asyncio_utils import ConcurrentRunner
-from nucliadb_utils.utilities import has_feature
-logger = logging.getLogger(__name__)
-hydrator_observer = Observer("hydrator", labels={"type": ""})
+from nucliadb_models.resource import ExtractedDataTypeName
+from nucliadb_models.search import ResourceProperties
 class ResourceHydrationOptions(BaseModel):
@@ -65,134 +43,7 @@ class TextBlockHydrationOptions(BaseModel):
     highlight: bool = False
     # list of exact matches to highlight
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
     # If true, only hydrate the text block if its text is not already populated
     only_hydrate_empty: bool = False
-@hydrator_observer.wrap({"type": "resource_text"})
-async def hydrate_resource_text(
-    kbid: str, rid: str, *, max_concurrent_tasks: int
-) -> list[tuple[FieldId, str]]:
-    resource = await cache.get_resource(kbid, rid)
-    if resource is None:  # pragma: no cover
-        return []
-    # Schedule the extraction of the text of each field in the resource
-    async with get_driver().ro_transaction() as txn:
-        resource.txn = txn
-        runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
-        for field_type, field_key in await resource.get_fields(force=True):
-            field_id = FieldId.from_pb(rid, field_type, field_key)
-            runner.schedule(hydrate_field_text(kbid, field_id))
-        # Include the summary aswell
-        runner.schedule(hydrate_field_text(kbid, FieldId(rid=rid, type="a", key="summary")))
-        # Wait for the results
-        field_extracted_texts = await runner.wait()
-    return [text for text in field_extracted_texts if text is not None]
-@hydrator_observer.wrap({"type": "resource_metadata"})
-async def hydrate_resource_metadata(
-    kbid: str,
-    resource_id: str,
-    options: ResourceHydrationOptions,
-    *,
-    concurrency_control: Optional[asyncio.Semaphore] = None,
-    service_name: Optional[str] = None,
-) -> Optional[Resource]:
-    """Fetch resource metadata and return it serialized."""
-    show = options.show
-    extracted = options.extracted
-    if ResourceProperties.EXTRACTED in show and has_feature(
-        const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
-    ):
-        # Returning extracted metadata in search results is deprecated and this flag
-        # will be set to True for all KBs in the future.
-        show.remove(ResourceProperties.EXTRACTED)
-        extracted = []
-    async with AsyncExitStack() as stack:
-        if concurrency_control is not None:
-            await stack.enter_async_context(concurrency_control)
-        async with get_driver().ro_transaction() as ro_txn:
-            serialized_resource = await managed_serialize(
-                txn=ro_txn,
-                kbid=kbid,
-                rid=resource_id,
-                show=show,
-                field_type_filter=options.field_type_filter,
-                extracted=extracted,
-                service_name=service_name,
-            )
-            if serialized_resource is None:
-                logger.warning(
-                    "Resource not found in database", extra={"kbid": kbid, "rid": resource_id}
-                )
-    return serialized_resource
-@hydrator_observer.wrap({"type": "field_text"})
-async def hydrate_field_text(
-    kbid: str,
-    field_id: FieldId,
-) -> Optional[tuple[FieldId, str]]:
-    extracted_text_pb = await cache.get_extracted_text_from_field_id(kbid, field_id)
-    if extracted_text_pb is None:  # pragma: no cover
-        return None
-    if field_id.subfield_id:
-        return field_id, extracted_text_pb.split_text[field_id.subfield_id]
-    else:
-        return field_id, extracted_text_pb.text
-@hydrator_observer.wrap({"type": "text_block"})
-async def hydrate_text_block(
-    kbid: str,
-    text_block: TextBlockMatch,
-    options: TextBlockHydrationOptions,
-    *,
-    concurrency_control: Optional[asyncio.Semaphore] = None,
-) -> TextBlockMatch:
-    """Given a `text_block`, fetch its corresponding text, modify and return the
-    `text_block` object.
-    """
-    if options.only_hydrate_empty and text_block.text:
-        return text_block
-    async with AsyncExitStack() as stack:
-        if concurrency_control is not None:
-            await stack.enter_async_context(concurrency_control)
-        text_block.text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=text_block.paragraph_id,
-            highlight=options.highlight,
-            matches=[],  # TODO: this was never implemented
-            ematches=options.ematches,
-        )
-    return text_block
-def text_block_to_find_paragraph(text_block: TextBlockMatch) -> FindParagraph:
-    return FindParagraph(
-        id=text_block.paragraph_id.full(),
-        text=text_block.text or "",
-        score=text_block.score,
-        score_type=text_block.score_type,
-        order=text_block.order,
-        labels=text_block.paragraph_labels,
-        fuzzy_result=text_block.fuzzy_search,
-        is_a_table=text_block.is_a_table,
-        reference=text_block.representation_file,
-        page_with_visual=text_block.page_with_visual,
-        position=text_block.position,
-        relevant_relations=text_block.relevant_relations,
-    )

nucliadb/search/search/hydrator/fields.py CHANGED Viewed

@@ -17,12 +17,25 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+from typing import cast
+from typing_extensions import assert_never
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
-from nucliadb.common.models_utils import from_proto
-from nucliadb.ingest.orm.resource import Resource
-from nucliadb.search.search.hydrator import hydrate_field_text
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
+from nucliadb.ingest.fields.generic import Generic
+from nucliadb.ingest.fields.link import Link
+from nucliadb.ingest.fields.text import Text
+from nucliadb.models.internal.augment import ConversationProp, FieldProp, FieldText, FieldValue
+from nucliadb.search.augmentor.fields import (
+    db_augment_conversation_field,
+    db_augment_file_field,
+    db_augment_generic_field,
+    db_augment_link_field,
+    db_augment_text_field,
+)
 from nucliadb_models import hydration as hydration_models
 from nucliadb_models.common import FieldTypeName
@@ -32,144 +45,173 @@ def page_preview_id(page_number: int) -> str:
     return f"{page_number}"
-async def hydrate_field(resource: Resource, field_id: FieldId, config: hydration_models.FieldHydration):
+async def hydrate_field(field: Field, field_id: FieldId, config: hydration_models.FieldHydration):
     field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]
     if field_type == FieldTypeName.TEXT:
         if not config.text is not None:
             return
-        return await hydrate_text_field(resource, field_id, config.text)
+        field = cast(Text, field)
+        return await hydrate_text_field(field, field_id, config.text)
     elif field_type == FieldTypeName.FILE is not None:
         if not config.file:
             return
-        return await hydrate_file_field(resource, field_id, config.file)
+        field = cast(File, field)
+        return await hydrate_file_field(field, field_id, config.file)
     elif field_type == FieldTypeName.LINK is not None:
         if not config.link:
             return
-        return await hydrate_link_field(resource, field_id, config.link)
+        field = cast(Link, field)
+        return await hydrate_link_field(field, field_id, config.link)
     elif field_type == FieldTypeName.CONVERSATION is not None:
         if not config.conversation:
             return
-        return await hydrate_conversation_field(resource, field_id, config.conversation)
+        field = cast(Conversation, field)
+        return await hydrate_conversation_field(field, field_id, config.conversation)
     elif field_type == FieldTypeName.GENERIC is not None:
         if not config.generic:
             return
-        return await hydrate_generic_field(resource, field_id, config.generic)
+        field = cast(Generic, field)
+        return await hydrate_generic_field(field, field_id, config.generic)
     else:  # pragma: no cover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(field_type)
 async def hydrate_text_field(
-    resource: Resource,
+    field: Text,
     field_id: FieldId,
     config: hydration_models.TextFieldHydration,
 ) -> hydration_models.HydratedTextField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+    augmented = await db_augment_text_field(field, field_id, select)
     hydrated = hydration_models.HydratedTextField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.TEXT,
     )
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
     return hydrated
 async def hydrate_file_field(
-    resource: Resource,
+    field: File,
     field_id: FieldId,
     config: hydration_models.FileFieldHydration,
 ) -> hydration_models.HydratedFileField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+    augmented = await db_augment_file_field(field, field_id, select)
     hydrated = hydration_models.HydratedFileField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.FILE,
     )
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = from_proto.field_file(value)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
     return hydrated
 async def hydrate_link_field(
-    resource: Resource,
+    field: Link,
     field_id: FieldId,
     config: hydration_models.LinkFieldHydration,
 ) -> hydration_models.HydratedLinkField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+    augmented = await db_augment_link_field(field, field_id, select)
     hydrated = hydration_models.HydratedLinkField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.LINK,
     )
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = from_proto.field_link(value)
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
     return hydrated
 async def hydrate_conversation_field(
-    resource: Resource,
+    field: Conversation,
     field_id: FieldId,
     config: hydration_models.ConversationFieldHydration,
 ) -> hydration_models.HydratedConversationField:
+    select: list[ConversationProp] = []
+    if config.value:
+        select.append(FieldValue())
+    augmented = await db_augment_conversation_field(field, field_id, select)
     hydrated = hydration_models.HydratedConversationField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.CONVERSATION,
     )
-    # TODO: implement conversation fields
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
     return hydrated
 async def hydrate_generic_field(
-    resource: Resource,
+    field: Generic,
     field_id: FieldId,
     config: hydration_models.GenericFieldHydration,
 ) -> hydration_models.HydratedGenericField:
+    select: list[FieldProp] = []
+    if config.value:
+        select.append(FieldValue())
+    if config.extracted_text:
+        select.append(FieldText())
+    augmented = await db_augment_generic_field(field, field_id, select)
     hydrated = hydration_models.HydratedGenericField(
         id=field_id.full(),
         resource=field_id.rid,
         field_type=FieldTypeName.GENERIC,
     )
-    if config.value:
-        field = await resource.get_field(field_id.key, field_id.pb_type)
-        value = await field.get_value()
-        hydrated.value = value
+    if config.value and augmented.value:
+        hydrated.value = augmented.value
-    if config.extracted_text:
-        field_text = await hydrate_field_text(resource.kb.kbid, field_id)
-        if field_text is not None:
-            (_, text) = field_text
-            hydrated.extracted = hydration_models.FieldExtractedData(text=text)
+    if config.extracted_text and augmented.text:
+        hydrated.extracted = hydration_models.FieldExtractedData(text=augmented.text)
     return hydrated

nucliadb/search/search/hydrator/images.py CHANGED Viewed

@@ -18,7 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import base64
-from typing import Optional, cast
+from typing import cast
+from typing_extensions import assert_never
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -32,7 +34,7 @@ from nucliadb_utils.utilities import get_storage
 async def paragraph_source_image(
     kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
-) -> Optional[Image]:
+) -> Image | None:
     """Certain paragraphs are extracted from images using techniques like OCR or
     inception. If that's the case, return the original image for this paragraph.
@@ -66,7 +68,7 @@ async def paragraph_source_image(
 async def download_image(
     kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
-) -> Optional[Image]:
+) -> Image | None:
     storage = await get_storage(service_name=SERVICE_NAME)
     sf = storage.file_extracted(
         kbid,
@@ -81,7 +83,7 @@ async def download_image(
     return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())
-async def download_page_preview(field: Field, page: int) -> Optional[Image]:
+async def download_page_preview(field: Field, page: int) -> Image | None:
     """Download a specific page preview for a field and return it as an Image.
     As not all fields have previews, this function can return None.
@@ -123,8 +125,6 @@ async def download_page_preview(field: Field, page: int) -> Optional[Image]:
         image = None
     else:  # pragma: no cover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(field_type)
     return image

nucliadb/search/search/hydrator/paragraphs.py CHANGED Viewed

@@ -19,12 +19,11 @@
 #
 import asyncio
 from dataclasses import dataclass
-from typing import Optional, Union
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.orm.resource import Resource
-from nucliadb.search.search import paragraphs
+from nucliadb.search.augmentor.paragraphs import get_paragraph_text
 from nucliadb.search.search.hydrator.fields import page_preview_id
 from nucliadb.search.search.hydrator.images import paragraph_source_image
 from nucliadb_models import hydration as hydration_models
@@ -112,19 +111,19 @@ class ParagraphIndex:
                 replacement for replacement in paragraph.relations.replacements
             ]
-    def get(self, paragraph_id: Union[str, ParagraphId]) -> Optional[resources_pb2.Paragraph]:
+    def get(self, paragraph_id: str | ParagraphId) -> resources_pb2.Paragraph | None:
         paragraph_id = str(paragraph_id)
         return self.paragraphs.get(paragraph_id)
-    def previous(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+    def previous(self, paragraph_id: str | ParagraphId) -> str | None:
         paragraph_id = str(paragraph_id)
         return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))
-    def next(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+    def next(self, paragraph_id: str | ParagraphId) -> str | None:
         paragraph_id = str(paragraph_id)
         return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))
-    def n_previous(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+    def n_previous(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
         assert count >= 1, f"can't find negative previous {count}"
         paragraph_id = str(paragraph_id)
         previous: list[str] = []
@@ -138,7 +137,7 @@ class ParagraphIndex:
             current_id = previous_id
         return previous
-    def n_next(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+    def n_next(self, paragraph_id: str | ParagraphId, count: int = 1) -> list[str]:
         assert count >= 1, f"can't find negative nexts {count}"
         paragraph_id = str(paragraph_id)
         nexts = []
@@ -152,23 +151,23 @@ class ParagraphIndex:
             nexts.append(next_id)
         return nexts
-    def parents(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def parents(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])
-    def siblings(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def siblings(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])
-    def replacements(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+    def replacements(self, paragraph_id: str | ParagraphId) -> list[str]:
         paragraph_id = str(paragraph_id)
         return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
 @dataclass
 class ExtraParagraphHydration:
-    field_page: Optional[int]
-    field_table_page: Optional[int]
+    field_page: int | None
+    field_table_page: int | None
     related_paragraph_ids: list[ParagraphId]
@@ -187,7 +186,7 @@ async def hydrate_paragraph(
     include more or less text than the originally extracted.
     """
-    kbid = resource.kb.kbid
+    kbid = resource.kbid
     hydrated = hydration_models.HydratedParagraph(
         id=paragraph_id.full(),
@@ -199,7 +198,7 @@ async def hydrate_paragraph(
     )
     if config.text:
-        text = await paragraphs.get_paragraph_text(kbid=kbid, paragraph_id=paragraph_id)
+        text = await get_paragraph_text(field, paragraph_id)
         hydrated.text = text
     requires_paragraph_metadata = config.image or config.table or config.page or config.related
@@ -210,8 +209,20 @@ async def hydrate_paragraph(
             # otherwise, this is a fake paragraph. We can't hydrate anything else here
             if config.related:
+                if config.related.neighbours is not None:
+                    before = config.related.neighbours.before
+                    after = config.related.neighbours.after
+                else:
+                    before, after = None, None
                 hydrated.related, related_ids = await related_paragraphs_refs(
-                    paragraph_id, field_paragraphs_index, config.related
+                    paragraph_id,
+                    field_paragraphs_index,
+                    neighbours_before=before,
+                    neighbours_after=after,
+                    parents=config.related.parents or False,
+                    siblings=config.related.siblings or False,
+                    replacements=config.related.replacements or False,
                 )
                 extra_hydration.related_paragraph_ids = related_ids
@@ -259,7 +270,12 @@ async def hydrate_paragraph(
 async def related_paragraphs_refs(
     paragraph_id: ParagraphId,
     index: ParagraphIndex,
-    config: hydration_models.RelatedParagraphHydration,
+    *,
+    neighbours_before: int | None = None,
+    neighbours_after: int | None = None,
+    parents: bool = False,
+    siblings: bool = False,
+    replacements: bool = False,
 ) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
     """Compute the related paragraph references for a specific `paragraph_id`
     and return them with the plain list of unique related paragraphs (to
@@ -269,36 +285,36 @@ async def related_paragraphs_refs(
     hydrated = hydration_models.RelatedParagraphRefs()
     related = set()
-    if config.neighbours:
+    if neighbours_before or neighbours_after:
         hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()
-        if config.neighbours.before is not None:
+        if neighbours_before is not None:
             hydrated.neighbours.before = []
-            if config.neighbours.before > 0:
-                for previous_id in index.n_previous(paragraph_id, config.neighbours.before):
+            if neighbours_before > 0:
+                for previous_id in index.n_previous(paragraph_id, neighbours_before):
                     hydrated.neighbours.before.insert(0, previous_id)
                     related.add(ParagraphId.from_string(previous_id))
-        if config.neighbours.after is not None:
+        if neighbours_after is not None:
             hydrated.neighbours.after = []
-            if config.neighbours.after > 0:
-                for next_id in index.n_next(paragraph_id, config.neighbours.after):
+            if neighbours_after > 0:
+                for next_id in index.n_next(paragraph_id, neighbours_after):
                     hydrated.neighbours.after.append(next_id)
                     related.add(ParagraphId.from_string(next_id))
-    if config.parents:
+    if parents:
         hydrated.parents = []
         for parent_id in index.parents(paragraph_id):
             hydrated.parents.append(parent_id)
             related.add(ParagraphId.from_string(parent_id))
-    if config.siblings:
+    if siblings:
         hydrated.siblings = []
         for sibling_id in index.siblings(paragraph_id):
             hydrated.siblings.append(sibling_id)
             related.add(ParagraphId.from_string(sibling_id))
-    if config.replacements:
+    if replacements:
         hydrated.replacements = []
         for replacement_id in index.replacements(paragraph_id):
             hydrated.replacements.append(replacement_id)

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl