nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
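--- /dev/null
+++ nucliadb/search/augmentor/fields.py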
@@ -0,0 +1,704 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from collections import deque
+from collections.abc import AsyncIterator, Sequence
+from typing import Deque, cast
+
+from typing_extensions import assert_never
+
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId
+from nucliadb.common.models_utils import from_proto
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
+from nucliadb.ingest.fields.generic import Generic
+from nucliadb.ingest.fields.link import Link
+from nucliadb.ingest.fields.text import Text
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.models.internal.augment import (
+    AnswerSelector,
+    AugmentedConversationField,
+    AugmentedConversationMessage,
+    AugmentedField,
+    AugmentedFileField,
+    AugmentedGenericField,
+    AugmentedLinkField,
+    AugmentedTextField,
+    ConversationAnswerOrAfter,
+    ConversationAttachments,
+    ConversationProp,
+    ConversationSelector,
+    ConversationText,
+    FieldClassificationLabels,
+    FieldEntities,
+    FieldProp,
+    FieldText,
+    FieldValue,
+    FileProp,
+    FileThumbnail,
+    FullSelector,
+    MessageSelector,
+    NeighboursSelector,
+    PageSelector,
+    WindowSelector,
+)
+from nucliadb.search.augmentor.metrics import augmentor_observer
+from nucliadb.search.augmentor.resources import get_basic
+from nucliadb.search.augmentor.utils import limited_concurrency
+from nucliadb.search.search import cache
+from nucliadb_models.common import FieldTypeName
+from nucliadb_protos import resources_pb2
+from nucliadb_utils.storages.storage import STORAGE_FILE_EXTRACTED
+
+# Number of messages to pull after a match in a message.
+# The hope is that this will be enough to capture the answer to the question.
+CONVERSATION_MESSAGE_CONTEXT_EXPANSION = 15
+
+
+async def augment_fields(
+    kbid: str,
+    given: list[FieldId],
+    select: list[FieldProp | ConversationProp],
+    *,
+    concurrency_control: asyncio.Semaphore | None = None,
+) -> dict[FieldId, AugmentedField | None]:
+    """Augment a list of fields according to the selected properties"""
+
+    ops = []
+    for field_id in given:
+        task = asyncio.create_task(
+            limited_concurrency(
+                augment_field(kbid, field_id, select),
+                max_ops=concurrency_control,
+            )
+        )
+        ops.append(task)
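+    # the augmentations run concurrently via asyncio.gather; limited_concurrency
+    # presumably gates each coroutine on the optional semaphore, bounding how
+    # many field fetches run at once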
+    results: list[AugmentedField | None] = await asyncio.gather(*ops)
+
+    augmented = {}
+    for field_id, augmentation in zip(given, results):
+        augmented[field_id] = augmentation
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "field"})
+async def augment_field(
+    kbid: str,
+    field_id: FieldId,
+    select: Sequence[FieldProp | ConversationProp],
+) -> AugmentedField | None:
+    rid = field_id.rid
+    resource = await cache.get_resource(kbid, rid)
+    if resource is None:
+        # skip resources that aren't in the DB
+        return None
+
+    field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
+    # we must check if the field exists, or get_field will return an empty
+    # field (behaviour intended for ingestion) that we don't want
+    if not (await resource.field_exists(field_type_pb, field_id.key)):
+        # skip fields that aren't in the DB
+        return None
+    field = await resource.get_field(field_id.key, field_id.pb_type)
+
+    return await db_augment_field(field, field_id, select)
+
+
+async def db_augment_field(
+    field: Field,
+    field_id: FieldId,
+    select: Sequence[FieldProp | FileProp | ConversationProp],
+) -> AugmentedField:
+    select = dedup_field_select(select)
+
+    field_type = field_id.type
+
+    # Note we cast `select` to the specific Union type required by each
+    # db_augment_* function. This is safe even if there are props that are
+    # not for a specific field, as they will be ignored
+
+    if field_type == FieldTypeName.TEXT.abbreviation():
+        field = cast(Text, field)
+        select = cast(list[FieldProp], select)
+        return await db_augment_text_field(field, field_id, select)
+
+    elif field_type == FieldTypeName.FILE.abbreviation():
+        field = cast(File, field)
+        select = cast(list[FileProp], select)
+        return await db_augment_file_field(field, field_id, select)
+
+    elif field_type == FieldTypeName.LINK.abbreviation():
+        field = cast(Link, field)
+        select = cast(list[FieldProp], select)
+        return await db_augment_link_field(field, field_id, select)
+
+    elif field_type == FieldTypeName.CONVERSATION.abbreviation():
+        field = cast(Conversation, field)
+        select = cast(list[ConversationProp], select)
+        return await db_augment_conversation_field(field, field_id, select)
+
+    elif field_type == FieldTypeName.GENERIC.abbreviation():
+        field = cast(Generic, field)
+        select = cast(list[FieldProp], select)
+        return await db_augment_generic_field(field, field_id, select)
+
+    else:  # pragma: no cover
+        assert False, f"unknown field type: {field_type}"
+
+
+def dedup_field_select(
+    select: Sequence[FieldProp | FileProp | ConversationProp],
+) -> Sequence[FieldProp | FileProp | ConversationProp]:
+    """Merge any duplicated property, taking the broadest augmentation possible."""
+    merged = {}
+
+    # TODO(decoupled-ask): deduplicate conversation props.
+    #
+    # Note that only conversation properties can be deduplicated (none of the
+    # others have any field). However, deduplicating the selector is not
+    # possible in many cases, so we do nothing
+    unmergeable = []
+
+    for prop in select:
+        if prop.prop not in merged:
+            merged[prop.prop] = prop
+
+        else:
+            if isinstance(prop, ConversationText) or isinstance(prop, ConversationAttachments):
+                unmergeable.append(prop)
+            elif (
+                isinstance(prop, FieldText)
+                or isinstance(prop, FieldValue)
+                or isinstance(prop, FieldClassificationLabels)
+                or isinstance(prop, FieldEntities)
+                or isinstance(prop, FileThumbnail)
+                or isinstance(prop, ConversationAnswerOrAfter)
+            ):
+                # properties without parameters
+                pass
+            else:  # pragma: no cover
+                assert_never(prop)
+
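+    # Illustrative example: [FieldText(), FieldText(), ConversationText(a), ConversationText(b)]
+    # returns [FieldText(), ConversationText(a), ConversationText(b)]: duplicates of
+    # parameterless props collapse into one, while selector-bearing conversation
+    # props are always kept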
+    return [*merged.values(), *unmergeable]
+
+
+@augmentor_observer.wrap({"type": "db_text_field"})
+async def db_augment_text_field(
+    field: Text,
+    field_id: FieldId,
+    select: Sequence[FieldProp],
+) -> AugmentedTextField:
+    augmented = AugmentedTextField(id=field.field_id)
+
+    for prop in select:
+        if isinstance(prop, FieldText):
+            augmented.text = await get_field_extracted_text(field_id, field)
+
+        elif isinstance(prop, FieldClassificationLabels):
+            augmented.classification_labels = await classification_labels(field_id, field.resource)
+
+        elif isinstance(prop, FieldEntities):
+            augmented.entities = await field_entities(field_id, field)
+
+        # text field props
+
+        elif isinstance(prop, FieldValue):
+            db_value = await field.get_value()
+            if db_value is None:
+                continue
+            augmented.value = from_proto.field_text(db_value)
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "db_file_field"})
+async def db_augment_file_field(
+    field: File,
+    field_id: FieldId,
+    select: Sequence[FileProp],
+) -> AugmentedFileField:
+    augmented = AugmentedFileField(id=field.field_id)
+
+    for prop in select:
+        if isinstance(prop, FieldText):
+            augmented.text = await get_field_extracted_text(field_id, field)
+
+        elif isinstance(prop, FieldClassificationLabels):
+            augmented.classification_labels = await classification_labels(field_id, field.resource)
+
+        elif isinstance(prop, FieldEntities):
+            augmented.entities = await field_entities(field_id, field)
+
+        # file field props
+
+        elif isinstance(prop, FieldValue):
+            db_value = await field.get_value()
+            if db_value is None:
+                continue
+            augmented.value = from_proto.field_file(db_value)
+
+        elif isinstance(prop, FileThumbnail):
+            augmented.thumbnail_path = await get_file_thumbnail_path(field, field_id)
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "db_link_field"})
+async def db_augment_link_field(
+    field: Link,
+    field_id: FieldId,
+    select: Sequence[FieldProp],
+) -> AugmentedLinkField:
+    augmented = AugmentedLinkField(id=field.field_id)
+
+    for prop in select:
+        if isinstance(prop, FieldText):
+            augmented.text = await get_field_extracted_text(field_id, field)
+
+        elif isinstance(prop, FieldClassificationLabels):
+            augmented.classification_labels = await classification_labels(field_id, field.resource)
+
+        elif isinstance(prop, FieldEntities):
+            augmented.entities = await field_entities(field_id, field)
+
+        # link field props
+
+        elif isinstance(prop, FieldValue):
+            db_value = await field.get_value()
+            if db_value is None:
+                continue
+            augmented.value = from_proto.field_link(db_value)
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "db_conversation_field"})
+async def db_augment_conversation_field(
+    field: Conversation,
+    field_id: FieldId,
+    select: list[ConversationProp],
+) -> AugmentedConversationField:
+    augmented = AugmentedConversationField(id=field.field_id)
+    # map (page, index) -> augmented message. The key uniquely identifies and
+    # orders messages
+    messages: dict[tuple[int, int], AugmentedConversationMessage] = {}
+
+    for prop in select:
+        if isinstance(prop, FieldText):
+            if isinstance(prop, ConversationText):
+                selector = prop.selector
+            else:
+                # when asking for the conversation text without details, we
+                # select the message if a split is provided in the id, or the
+                # full conversation otherwise
+                if field_id.subfield_id is not None:
+                    selector = MessageSelector()
+                else:
+                    selector = FullSelector()
+
+            # gather the text from each message matching the selector
+            extracted_text_pb = await cache.get_field_extracted_text(field)
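+            # when available, the processed extracted text stores each
+            # message's text in split_text, keyed by message ident; otherwise
+            # we fall back to the raw message content below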
+            async for page, index, message in conversation_selector(field, field_id, selector):
+                augmented_message = messages.setdefault(
+                    (page, index), AugmentedConversationMessage(ident=message.ident)
+                )
+                if extracted_text_pb is not None and message.ident in extracted_text_pb.split_text:
+                    augmented_message.text = extracted_text_pb.split_text[message.ident]
+                else:
+                    augmented_message.text = message.content.text
+
+        elif isinstance(prop, FieldValue):
+            db_value = await field.get_metadata()
+            augmented.value = from_proto.field_conversation(db_value)
+
+        elif isinstance(prop, FieldClassificationLabels):
+            augmented.classification_labels = await classification_labels(field_id, field.resource)
+
+        elif isinstance(prop, FieldEntities):
+            augmented.entities = await field_entities(field_id, field)
+
+        elif isinstance(prop, ConversationAttachments):
+            # Each message on a conversation field can have attachments as
+            # references to other fields in the same resource.
+            #
+            # Here, we iterate through all the messages matched by the selector
+            # and collect all the attachment references
+            async for page, index, message in conversation_selector(field, field_id, prop.selector):
+                augmented_message = messages.setdefault(
+                    (page, index), AugmentedConversationMessage(ident=message.ident)
+                )
+                augmented_message.attachments = []
+                for ref in message.content.attachments_fields:
+                    attachment_id = FieldId.from_pb(
+                        field.uuid, ref.field_type, ref.field_id, ref.split or None
+                    )
+                    augmented_message.attachments.append(attachment_id)
+
+        elif isinstance(prop, ConversationAnswerOrAfter):
+            async for page, index, message in conversation_answer_or_after(field, field_id):
+                augmented_message = messages.setdefault(
+                    (page, index), AugmentedConversationMessage(ident=message.ident)
+                )
+                if not augmented_message.text:
+                    augmented_message.text = message.content.text
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    if len(messages) > 0:
+        augmented.messages = []
+        for (_page, _index), m in sorted(messages.items()):
+            augmented.messages.append(m)
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "db_generic_field"})
+async def db_augment_generic_field(
+    field: Generic,
+    field_id: FieldId,
+    select: Sequence[FieldProp],
+) -> AugmentedGenericField:
+    augmented = AugmentedGenericField(id=field.field_id)
+
+    for prop in select:
+        if isinstance(prop, FieldText):
+            augmented.text = await get_field_extracted_text(field_id, field)
+
+        elif isinstance(prop, FieldClassificationLabels):
+            augmented.classification_labels = await classification_labels(field_id, field.resource)
+
+        elif isinstance(prop, FieldEntities):
+            augmented.entities = await field_entities(field_id, field)
+
+        # generic field props
+
+        elif isinstance(prop, FieldValue):
+            db_value = await field.get_value()
+            augmented.value = db_value
+
+        else:  # pragma: no cover
+            assert_never(prop)
+
+    return augmented
+
+
+@augmentor_observer.wrap({"type": "field_text"})
+async def get_field_extracted_text(id: FieldId, field: Field) -> str | None:
+    extracted_text_pb = await cache.get_field_extracted_text(field)
+    if extracted_text_pb is None:  # pragma: no cover
+        return None
+
+    if id.subfield_id:
+        return extracted_text_pb.split_text[id.subfield_id]
+    else:
+        return extracted_text_pb.text
+
+
+async def classification_labels(id: FieldId, resource: Resource) -> dict[str, set[str]] | None:
+    basic = await get_basic(resource)
+    if basic is None:
+        return None
+
+    labels: dict[str, set[str]] = {}
+    for fc in basic.computedmetadata.field_classifications:
+        if fc.field.field == id.key and fc.field.field_type == id.pb_type:
+            for classification in fc.classifications:
+                if classification.cancelled_by_user:  # pragma: no cover
+                    continue
+                labels.setdefault(classification.labelset, set()).add(classification.label)
+    return labels
+
+
+async def field_entities(id: FieldId, field: Field) -> dict[str, set[str]] | None:
+    field_metadata = await field.get_field_metadata()
+    if field_metadata is None:
+        return None
+
+    ners: dict[str, set[str]] = {}
+    # Data Augmentation + Processor entities
+    for (
+        data_augmentation_task_id,
+        entities_wrapper,
+    ) in field_metadata.metadata.entities.items():
+        for entity in entities_wrapper.entities:
+            ners.setdefault(entity.label, set()).add(entity.text)
+    # Legacy processor entities
+    # TODO(decoupled-ask): Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    for token, family in field_metadata.metadata.ner.items():
+        ners.setdefault(family, set()).add(token)
+
+    return ners
+
+
+async def get_file_thumbnail_path(field: File, field_id: FieldId) -> str | None:
+    thumbnail = await field.thumbnail()
+    if thumbnail is None:
+        return None
+
+    # When ingesting file processed data, we move thumbnails to an owned
+    # path. The thumbnail.key must then match this path so we can safely
+    # return a path that can be used with the download API to get the
+    # actual image
+    _expected_prefix = STORAGE_FILE_EXTRACTED.format(
+        kbid=field.kbid, uuid=field.uuid, field_type=field_id.type, field=field_id.key, key=""
+    )
+    assert thumbnail.key.startswith(_expected_prefix), (
+        "we use a hardcoded path for file thumbnails and we assume the key matches it"
+    )
+    thumbnail_path = thumbnail.key.removeprefix(_expected_prefix)
+
+    return thumbnail_path
+
+
+async def find_conversation_message(
+    field: Conversation, ident: str
+) -> tuple[int, int, resources_pb2.Message] | None:
+    """Find a message in the conversation identified by `ident`."""
+    conversation_metadata = await field.get_metadata()
+    for page in range(1, conversation_metadata.pages + 1):
+        conversation = await field.db_get_value(page)
+        for idx, message in enumerate(conversation.messages):
+            if message.ident == ident:
+                return page, idx, message
+    return None
+
+
+async def iter_conversation_messages(
+    field: Conversation,
+    *,
+    start_from: tuple[int, int] = (1, 0),  # (page, message)
+) -> AsyncIterator[tuple[int, int, resources_pb2.Message]]:
+    """Iterate through the conversation messages starting from a specific page
+    and index.
+
+    """
+    start_page, start_index = start_from
+    conversation_metadata = await field.get_metadata()
+    for page in range(start_page, conversation_metadata.pages + 1):
+        conversation = await field.db_get_value(page)
+        for idx, message in enumerate(conversation.messages[start_index:]):
+            yield (page, start_index + idx, message)
+        # next iteration we want all messages
+        start_index = 0
+
+
+async def conversation_answer(
+    field: Conversation,
+    *,
+    start_from: tuple[int, int] = (1, 0),  # (page, message)
+) -> tuple[int, int, resources_pb2.Message] | None:
+    """Find the next conversation message of type ANSWER starting from a
+    specific page and index.

+    """
+    async for page, index, message in iter_conversation_messages(field, start_from=start_from):
+        if message.type == resources_pb2.Message.MessageType.ANSWER:
+            return page, index, message
+    return None
+
+
+async def conversation_messages_after(
+    field: Conversation,
+    *,
+    start_from: tuple[int, int] = (1, 0),  # (page, index)
+    limit: int | None = None,
+) -> AsyncIterator[tuple[int, int, resources_pb2.Message]]:
+    assert limit is None or limit > 0, "this function can't iterate backwards"
+    async for page, index, message in iter_conversation_messages(field, start_from=start_from):
+        yield page, index, message
+
+        if limit is not None:
+            limit -= 1
+            if limit == 0:
+                break
+
+
+async def conversation_selector(
+    field: Conversation,
+    field_id: FieldId,
+    selector: ConversationSelector,
+) -> AsyncIterator[tuple[int, int, resources_pb2.Message]]:
+    """Given a conversation, iterate through the messages matched by a
+    selector.
+
+    """
+    split = field_id.subfield_id
+
+    if isinstance(selector, MessageSelector):
+        if selector.id is None and selector.index is None and split is None:
+            return
+
+        if selector.index is not None:
+            metadata = await field.get_metadata()
+            if metadata is None:
+                # we can't know about pages/messages
+                return
+
+            if isinstance(selector.index, int):
+                page = selector.index // metadata.size + 1
+                index = selector.index % metadata.size
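+                # e.g. with metadata.size == 50, a global index of 120 resolves
+                # to page 3 (120 // 50 + 1) and in-page index 20 (120 % 50)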
+
+            elif isinstance(selector.index, str):
+                if selector.index == "first":
+                    page, index = (1, 0)
+                elif selector.index == "last":
+                    page = metadata.pages
+                    index = metadata.total % metadata.size - 1
+                else:  # pragma: no cover
+                    assert_never(selector.index)
+
+            else:  # pragma: no cover
+                assert_never(selector.index)
+
+            found = None
+            async for found in iter_conversation_messages(field, start_from=(page, index)):
+                break
+
+            if found is None:
+                return
+
+            page, index, message = found
+            yield page, index, message
+
+        else:
+            # selector.id takes priority over the field id, as it is more specific
+            if selector.id is not None:
+                split = selector.id
+            assert split is not None
+
+            found = await find_conversation_message(field, split)
+            if found is None:
+                return
+
+            page, index, message = found
+            yield page, index, message
+
+    elif isinstance(selector, PageSelector):
+        if split is None:
+            return
+        found = await find_conversation_message(field, split)
+        if found is None:
+            return
+        page, _, _ = found
+
+        conversation_page = await field.db_get_value(page)
+        for index, message in enumerate(conversation_page.messages):
+            yield page, index, message
+
+    elif isinstance(selector, NeighboursSelector):
+        selector = cast(NeighboursSelector, selector)
+        if split is None:
+            return
+        found = await find_conversation_message(field, split)
+        if found is None:
+            return
+        page, index, message = found
+        yield page, index, message
+
+        start_from = (page, index + 1)
+        async for page, index, message in conversation_messages_after(
+            field, start_from=start_from, limit=selector.after
+        ):
+            yield page, index, message
+
+    elif isinstance(selector, WindowSelector):
+        if split is None:
+            return
+        # Find the position of the `split` message and get the window
+        # surrounding it. If there are not enough preceding/following messages,
+        # the window won't be centered
+        messages: Deque[tuple[int, int, resources_pb2.Message]] = deque(maxlen=selector.size)
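+        # the deque keeps only the last `size` messages seen; once the split
+        # message is found, `pending` counts down the (size - 1) // 2 following
+        # messages still to consume, so the match sits roughly in the middle
+        # when enough messages exist on both sides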
+        metadata = await field.get_metadata()
+        pending = -1
+        for page in range(1, metadata.pages + 1):
+            conversation_page = await field.db_get_value(page)
+            for index, message in enumerate(conversation_page.messages):
+                messages.append((page, index, message))
+                if pending > 0:
+                    pending -= 1
+                if message.ident == split:
+                    pending = (selector.size - 1) // 2
+                if pending == 0:
+                    break
+            if pending == 0:
+                break
+
+        for page, index, message in messages:
+            yield page, index, message
+
+    elif isinstance(selector, AnswerSelector):
+        if split is None:
+            return
+        found = await find_conversation_message(field, split)
+        if found is None:
+            return
+        page, index, message = found
+
+        found = await conversation_answer(field, start_from=(page, index))
+        if found is not None:
+            page, index, answer = found
+            yield page, index, answer
+
+    elif isinstance(selector, FullSelector):
+        async for page, index, message in iter_conversation_messages(field):
+            yield page, index, message
+
+    else:  # pragma: no cover
+        assert_never(selector)
+
+
+async def conversation_answer_or_after(
+    field: Conversation, field_id: FieldId
+) -> AsyncIterator[tuple[int, int, resources_pb2.Message]]:
+    m: resources_pb2.Message | None = None
+    # first, search for the message in the conversation
+    async for page, index, m in conversation_selector(field, field_id, MessageSelector()):
+        pass
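+    # with no id or index, MessageSelector falls back to the field id's split,
+    # so this loop yields at most one message; after it, m holds that message
+    # (or stays None when nothing matched)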
+
+    if m is None:
+        return
+
+    if m.type == resources_pb2.Message.MessageType.QUESTION:
+        # try to find an answer for this question
+        found = await conversation_answer(field, start_from=(page, index + 1))
+        if found is None:
+            return
+        else:
+            page, index, answer = found
+            yield page, index, answer
+
+    else:
+        # add a bunch of messages after this for more context
+        async for page, index, message in conversation_messages_after(
+            field, start_from=(page, index + 1), limit=CONVERSATION_MESSAGE_CONTEXT_EXPANSION
+        ):
+            yield page, index, message