nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from collections.abc import Callable
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Annotated, Any, Literal
|
|
23
|
+
|
|
24
|
+
from pydantic import BaseModel, Discriminator, Field, Tag, model_validator
|
|
25
|
+
from typing_extensions import Self
|
|
26
|
+
|
|
27
|
+
import nucliadb_models
|
|
28
|
+
from nucliadb.common.external_index_providers.base import TextBlockMatch
|
|
29
|
+
from nucliadb.common.ids import FieldId, ParagraphId
|
|
30
|
+
from nucliadb_models import filters
|
|
31
|
+
from nucliadb_models.augment import ResourceId
|
|
32
|
+
from nucliadb_models.common import FieldTypeName
|
|
33
|
+
from nucliadb_models.conversation import FieldConversation
|
|
34
|
+
from nucliadb_models.file import FieldFile
|
|
35
|
+
from nucliadb_models.link import FieldLink
|
|
36
|
+
from nucliadb_models.metadata import Extra, Origin
|
|
37
|
+
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
38
|
+
from nucliadb_models.search import (
|
|
39
|
+
ResourceProperties,
|
|
40
|
+
SearchParamDefaults,
|
|
41
|
+
TextPosition,
|
|
42
|
+
)
|
|
43
|
+
from nucliadb_protos import resources_pb2
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SelectProp(BaseModel):
    """Base class for every selectable property.

    `prop` is the discriminator field; subclasses narrow it to a Literal tag
    (e.g. "text", "value") used by the Annotated unions below.
    """

    prop: Any

    @model_validator(mode="after")
    def set_discriminator(self) -> Self:
        # Ensure discriminator is explicitly set so it's always serialized.
        # The re-assignment looks like a no-op but marks the field as "set"
        # for pydantic even when the default Literal value was used.
        self.prop = self.prop
        return self
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def discriminator(name: str) -> Callable[[Any], str | None]:
|
|
57
|
+
def _inner(v: Any) -> str | None:
|
|
58
|
+
if isinstance(v, dict):
|
|
59
|
+
return v.get(name, None)
|
|
60
|
+
else:
|
|
61
|
+
return getattr(v, name, None)
|
|
62
|
+
|
|
63
|
+
return _inner
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Shared discriminator callables for the Annotated unions below: each reads
# the corresponding tag field from a dict or model instance.
prop_discriminator = discriminator(name="prop")
from_discriminator = discriminator(name="from")
name_discriminator = discriminator(name="name")
|
|
69
|
+
|
|
70
|
+
# Complex ids
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class Metadata(BaseModel):
    """Extraction metadata for a paragraph: where it came from and how."""

    is_an_image: bool
    is_a_table: bool

    # for extracted from visual content (ocr, inception, tables)
    source_file: str | None

    # for documents (pdf, docx...) only
    page: int | None
    in_page_with_visual: bool | None

    @classmethod
    def from_text_block_match(cls, text_block: TextBlockMatch) -> Self:
        """Build metadata from an index text block match."""
        return cls(
            is_an_image=text_block.is_an_image,
            is_a_table=text_block.is_a_table,
            source_file=text_block.representation_file,
            page=text_block.position.page_number,
            in_page_with_visual=text_block.page_with_visual,
        )

    @classmethod
    def from_db_paragraph(cls, paragraph: resources_pb2.Paragraph) -> Self:
        """Build metadata from a stored paragraph protobuf."""
        kind = paragraph.kind
        image_kinds = (
            resources_pb2.Paragraph.TypeParagraph.OCR,
            resources_pb2.Paragraph.TypeParagraph.INCEPTION,
        )
        # REVIEW(decoupled-ask): can a paragraph be of a different type and still be a table?
        table = (
            kind == resources_pb2.Paragraph.TypeParagraph.TABLE
            or paragraph.representation.is_a_table
        )

        # proto strings default to "", which we normalize to None
        source = paragraph.representation.reference_file or None

        page: int | None = None
        visual: bool | None = None
        if paragraph.HasField("page"):
            page = paragraph.page.page
            visual = paragraph.page.page_with_visual

        return cls(
            is_an_image=kind in image_kinds,
            is_a_table=table,
            source_file=source,
            page=page,
            in_page_with_visual=visual,
        )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class Paragraph(BaseModel):
    """A paragraph reference with optional extraction metadata."""

    id: ParagraphId
    metadata: Metadata | None = None

    @classmethod
    def from_text_block_match(cls, text_block: TextBlockMatch) -> Self:
        # Build from an index text block match
        return cls(
            id=text_block.paragraph_id,
            metadata=Metadata.from_text_block_match(text_block),
        )

    @classmethod
    def from_db_paragraph(cls, id: ParagraphId, paragraph: resources_pb2.Paragraph) -> Self:
        # Build from a paragraph stored in maindb
        return cls(
            id=id,
            metadata=Metadata.from_db_paragraph(paragraph),
        )
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# SELECT props
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class ParagraphText(SelectProp):
    """Hydrate the extracted text of the paragraph."""

    prop: Literal["text"] = "text"


class ParagraphPosition(SelectProp):
    """Hydrate the position of the paragraph in the original document."""

    prop: Literal["position"] = "position"


class ParagraphImage(SelectProp):
    """Hydrate the source image of a paragraph extracted from visual content."""

    prop: Literal["image"] = "image"


class ParagraphTable(SelectProp):
    """Hydrate the image representation of a table paragraph."""

    prop: Literal["table"] = "table"

    # sometimes, due to a not perfect extraction, is better to use the page
    # preview instead of the table image for context. This options let users
    # choose
    prefer_page_preview: bool = False


class ParagraphPage(SelectProp):
    """Hydrate page information for the paragraph."""

    prop: Literal["page"] = "page"
    preview: bool = True


class RelatedParagraphs(SelectProp):
    """Hydrate paragraphs surrounding the given one."""

    prop: Literal["related"] = "related"
    neighbours_before: int = Field(ge=0, description="Number of previous paragraphs to hydrate")
    neighbours_after: int = Field(ge=0, description="Number of following paragraphs to hydrate")


# Union of paragraph props, discriminated by the `prop` field
ParagraphProp = Annotated[
    (
        Annotated[ParagraphText, Tag("text")]
        | Annotated[ParagraphPosition, Tag("position")]
        | Annotated[ParagraphImage, Tag("image")]
        | Annotated[ParagraphTable, Tag("table")]
        | Annotated[ParagraphPage, Tag("page")]
        | Annotated[RelatedParagraphs, Tag("related")]
    ),
    Discriminator(prop_discriminator),
]
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class FieldText(SelectProp):
    """Hydrate the extracted text of the field."""

    prop: Literal["text"] = "text"


class FieldValue(SelectProp):
    """Hydrate the field value."""

    prop: Literal["value"] = "value"


class FieldClassificationLabels(SelectProp):
    """Hydrate the classification labels of the field."""

    prop: Literal["classification_labels"] = "classification_labels"


class FieldEntities(SelectProp):
    """Same as MetadataExtensionStrategy asking for ners"""

    prop: Literal["entities"] = "entities"


# Union of generic field props, discriminated by the `prop` field
FieldProp = Annotated[
    (
        Annotated[FieldText, Tag("text")]
        | Annotated[FieldValue, Tag("value")]
        | Annotated[FieldClassificationLabels, Tag("classification_labels")]
        | Annotated[FieldEntities, Tag("entities")]
    ),
    Discriminator(prop_discriminator),
]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class FileThumbnail(SelectProp):
    """File field thumbnail image"""

    prop: Literal["thumbnail"] = "thumbnail"


# Union of file field props: the generic field props plus the thumbnail,
# discriminated by the `prop` field
FileProp = Annotated[
    (
        Annotated[FieldText, Tag("text")]
        | Annotated[FieldValue, Tag("value")]
        | Annotated[FieldClassificationLabels, Tag("classification_labels")]
        | Annotated[FieldEntities, Tag("entities")]
        | Annotated[FileThumbnail, Tag("thumbnail")]
    ),
    Discriminator(prop_discriminator),
]
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
class MessageSelector(BaseModel):
    """Selects the message specified by the field id."""

    name: Literal["message"] = "message"

    id: str | None = None
    index: Literal["first"] | Literal["last"] | int | None = Field(
        default=None,
        description="Index of the message in the conversation. Indexing starts at 0",
    )

    @model_validator(mode="after")
    def id_or_index(self) -> Self:
        # `id` and `index` are mutually exclusive ways to point to a message
        if self.id is not None and self.index is not None:
            raise ValueError("Can't define both `id` and `index`")
        return self


class PageSelector(BaseModel):
    """Selects all messages from the page of the message specified by the field
    id.

    """

    name: Literal["page"] = "page"


class NeighboursSelector(BaseModel):
    """Selects a bunch of messages preceding or following the one specified by
    the field id.

    """

    name: Literal["neighbours"] = "neighbours"
    # NOTE(review): the docstring mentions preceding messages but only `after`
    # is defined — confirm whether a `before` field is missing
    after: int = Field(ge=1)


class WindowSelector(BaseModel):
    """Selects a window of certain size around the message specified by the
    field id.

    If size=1, this behaves as MessageSelector.

    If, for example, size=5 and there are 2 messages preceding and 2 following,
    it behaves as a NeighbourSelector(before=2, after=2). However, if there's
    not enough messages before/after, the window will be offset. For example, if
    the selected message is the first on the conversation and size=5, it'll
    select the first 5 messages of the conversation.

    """

    name: Literal["window"] = "window"
    size: int = Field(ge=1)


class AnswerSelector(BaseModel):
    """Search for the next message of type ANSWER. For ids containing the split,
    search starts from that message rather than the beginning of the
    conversation.

    """

    name: Literal["answer"] = "answer"


class FullSelector(BaseModel):
    """Selects the whole conversation"""

    name: Literal["full"] = "full"


# Union of message selectors, discriminated by the `name` field
ConversationSelector = Annotated[
    (
        Annotated[MessageSelector, Tag("message")]
        | Annotated[PageSelector, Tag("page")]
        | Annotated[NeighboursSelector, Tag("neighbours")]
        | Annotated[WindowSelector, Tag("window")]
        | Annotated[AnswerSelector, Tag("answer")]
        | Annotated[FullSelector, Tag("full")]
    ),
    Discriminator(name_discriminator),
]
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
class ConversationText(FieldText):
    """Hydrate the text of the conversation messages picked by `selector`."""

    # same "text" tag as FieldText; the extra `selector` narrows which messages
    prop: Literal["text"] = "text"
    selector: ConversationSelector


class ConversationAttachments(SelectProp):
    """Hydrate the attachments of the conversation messages picked by `selector`."""

    prop: Literal["attachments"] = "attachments"
    selector: ConversationSelector = Field(default_factory=FullSelector)


class ConversationAnswerOrAfter(SelectProp):
    """Hacky conversation prop that given a conversation message (paragraph or
    split), if it's type QUESTION, searches an answer and otherwise provides a
    fixed window of messages after.

    This was originally used in the /ask endpoint for conversation matches if no
    strategy was selected, however, many bugs around it made it not really used.
    Thus, the value provided by this is not clear and further evaluation should
    be performed.

    """

    prop: Literal["answer_or_after"] = "answer_or_after"
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# Union of conversation props, discriminated by the `prop` field.
#
# Fix: the original union listed both ConversationText and FieldText under the
# same Tag("text"). FieldText is the base class of ConversationText, so the
# duplicate tag made the union ambiguous (one of the two entries was dead or
# shadowed the other, depending on pydantic's tag resolution). Keep only
# ConversationText for the "text" tag.
ConversationProp = Annotated[
    (
        Annotated[ConversationText, Tag("text")]
        | Annotated[FieldValue, Tag("value")]
        | Annotated[FieldClassificationLabels, Tag("classification_labels")]
        | Annotated[FieldEntities, Tag("entities")]
        | Annotated[ConversationAttachments, Tag("attachments")]
        | Annotated[ConversationAnswerOrAfter, Tag("answer_or_after")]
    ),
    Discriminator(prop_discriminator),
]
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
class ResourceTitle(SelectProp):
    """Hydrate the resource title."""

    prop: Literal["title"] = "title"


class ResourceSummary(SelectProp):
    """Hydrate the resource summary."""

    prop: Literal["summary"] = "summary"


class ResourceOrigin(SelectProp):
    """Same as show=["origin"] using GET resource or search endpoints"""

    prop: Literal["origin"] = "origin"


class ResourceExtra(SelectProp):
    """Same as show=["extra"] and MetadataExtensionStrategy asking for
    extra_metadata

    """

    prop: Literal["extra"] = "extra"


class ResourceSecurity(SelectProp):
    """Same as show=["security"] using GET resource or search endpoints"""

    prop: Literal["security"] = "security"


class ResourceClassificationLabels(SelectProp):
    """Same as MetadataExtensionStrategy asking for classification_labels"""

    prop: Literal["classification_labels"] = "classification_labels"


class ResourceFieldsFilter(BaseModel):
    # field ids to restrict the operation to
    ids: list[str]


# Union of resource props, discriminated by the `prop` field
ResourceProp = Annotated[
    (
        Annotated[ResourceTitle, Tag("title")]
        | Annotated[ResourceSummary, Tag("summary")]
        | Annotated[ResourceOrigin, Tag("origin")]
        | Annotated[ResourceExtra, Tag("extra")]
        | Annotated[ResourceSecurity, Tag("security")]
        | Annotated[ResourceClassificationLabels, Tag("classification_labels")]
    ),
    Discriminator(prop_discriminator),
]
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# Augmentations
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class ResourceAugment(BaseModel, extra="forbid"):
    """Augment resources with a selection of resource-level props."""

    given: list[ResourceId | FieldId | ParagraphId]
    select: list[ResourceProp]
    from_: Literal["resources"] = Field(default="resources", alias="from")


class DeepResourceAugment(BaseModel, extra="forbid"):
    """Augment resources using old-style resource serialization options."""

    given: list[ResourceId]

    # old style serialization parameters
    show: list[ResourceProperties] = SearchParamDefaults.show.to_pydantic_field()
    extracted: list[ExtractedDataTypeName] = SearchParamDefaults.extracted.to_pydantic_field()
    field_type_filter: list[FieldTypeName] = SearchParamDefaults.field_type_filter.to_pydantic_field()

    from_: Literal["resources.deep"] = Field(default="resources.deep", alias="from")


class FileAugment(BaseModel, extra="forbid"):
    """Augment file fields with file-specific props."""

    given: list[FieldId | ParagraphId]
    select: list[FileProp]
    from_: Literal["files"] = Field(default="files", alias="from")


class ConversationAugmentLimits(BaseModel):
    # cap on hydrated messages per conversation; None disables the cap
    max_messages: int | None = Field(default=15, ge=0)


class ConversationAugment(BaseModel, extra="forbid"):
    """Augment conversation fields with conversation-specific props."""

    given: list[FieldId | ParagraphId]
    select: list[ConversationProp]
    from_: Literal["conversations"] = Field(default="conversations", alias="from")
    # TODO(decoupled-storage): remove?
    limits: ConversationAugmentLimits | None = Field(default_factory=ConversationAugmentLimits)


# Field filter union, discriminated by the `prop` field
FieldFilter = Annotated[
    (Annotated[filters.Field, Tag("field")] | Annotated[filters.Generated, Tag("generated")]),
    Discriminator(prop_discriminator),
]


class FieldAugment(BaseModel, extra="forbid"):
    """Augment fields with generic field props."""

    # NOTE(review): unlike other augments, `given` is a union of homogeneous
    # lists rather than a list of a union type — confirm this is intentional
    given: list[ResourceId] | list[FieldId] | list[ParagraphId]
    select: list[FieldProp]
    from_: Literal["fields"] = Field(default="fields", alias="from")
    filter: list[FieldFilter] | None = None


class ParagraphAugment(BaseModel, extra="forbid"):
    """Augment paragraphs with paragraph-level props."""

    given: list[Paragraph]
    select: list[ParagraphProp]
    from_: Literal["paragraphs"] = Field(default="paragraphs", alias="from")


class AugmentationLimits(BaseModel, extra="forbid"):
    # TODO(decoupled-ask): global augmentation limits (max chars, images, image size...)
    ...


# Union of all augment kinds, discriminated by the `from` field
Augment = Annotated[
    (
        Annotated[ResourceAugment, Tag("resources")]
        | Annotated[DeepResourceAugment, Tag("resources.deep")]
        | Annotated[FieldAugment, Tag("fields")]
        | Annotated[FileAugment, Tag("files")]
        | Annotated[ConversationAugment, Tag("conversations")]
        | Annotated[ParagraphAugment, Tag("paragraphs")]
    ),
    Discriminator(from_discriminator),
]


class AugmentRequest(BaseModel, extra="forbid"):
    """Top-level request: a list of augmentations plus optional global limits."""

    augmentations: list[Augment] = Field(
        default_factory=list,
        description="List of augmentations to be performed",
    )

    limits: AugmentationLimits | None = Field(
        default=None,
        description="Global hydration limits applied to the whole request",
    )
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
# Augmented data models
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
@dataclass
class AugmentedRelatedParagraphs:
    # paragraphs hydrated around the requested one
    neighbours_before: list[ParagraphId]
    neighbours_after: list[ParagraphId]


@dataclass
class AugmentedParagraph:
    id: ParagraphId

    # textual representation of the paragraph
    text: str | None

    position: TextPosition | None

    # original image for the paragraph when it has been extracted from an image
    # or a table. This value is the path to be used in the download endpoint
    source_image_path: str | None

    # image extracted from the table. It can be just from the table or the page,
    # depending on the augment parameters
    table_image_path: str | None

    # if the paragraph comes from a page, this is the path for the download
    # endpoint to get the page preview image
    page_preview_path: str | None

    related: AugmentedRelatedParagraphs | None


@dataclass
class BaseAugmentedField:
    # Common augmented data for any field type
    id: FieldId

    classification_labels: dict[str, set[str]] | None = None
    entities: dict[str, set[str]] | None = None


@dataclass
class AugmentedTextField(BaseAugmentedField):
    value: nucliadb_models.text.FieldText | None = None

    text: str | None = None


@dataclass
class AugmentedFileField(BaseAugmentedField):
    value: FieldFile | None = None

    text: str | None = None
    # path for the download endpoint to get the file thumbnail
    thumbnail_path: str | None = None


@dataclass
class AugmentedLinkField(BaseAugmentedField):
    value: FieldLink | None = None

    text: str | None = None


@dataclass
class AugmentedConversationMessage:
    # a single hydrated conversation message
    ident: str
    text: str | None = None
    attachments: list[FieldId] | None = None


@dataclass
class AugmentedConversationField(BaseAugmentedField):
    value: FieldConversation | None = None
    messages: list[AugmentedConversationMessage] | None = None


@dataclass
class AugmentedGenericField(BaseAugmentedField):
    value: str | None = None
    text: str | None = None


# Any of the per-field-type augmented results
AugmentedField = (
    BaseAugmentedField
    | AugmentedTextField
    | AugmentedFileField
    | AugmentedLinkField
    | AugmentedConversationField
    | AugmentedGenericField
)
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
@dataclass
|
|
596
|
+
class AugmentedResource:
|
|
597
|
+
id: str
|
|
598
|
+
|
|
599
|
+
title: str | None
|
|
600
|
+
summary: str | None
|
|
601
|
+
|
|
602
|
+
origin: Origin | None
|
|
603
|
+
extra: Extra | None
|
|
604
|
+
security: nucliadb_models.security.ResourceSecurity | None
|
|
605
|
+
|
|
606
|
+
classification_labels: dict[str, set[str]] | None
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
@dataclass
|
|
610
|
+
class Augmented:
|
|
611
|
+
resources: dict[str, AugmentedResource]
|
|
612
|
+
resources_deep: dict[str, Resource]
|
|
613
|
+
fields: dict[FieldId, AugmentedField]
|
|
614
|
+
paragraphs: dict[ParagraphId, AugmentedParagraph]
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
|
|
25
25
|
from datetime import datetime
|
|
26
26
|
from enum import Enum
|
|
27
|
-
from typing import TYPE_CHECKING
|
|
27
|
+
from typing import TYPE_CHECKING
|
|
28
28
|
|
|
29
29
|
from pydantic import BaseModel, Field
|
|
30
30
|
|
|
@@ -65,8 +65,8 @@ class PushTextFormat(int, Enum):
|
|
|
65
65
|
class Text(BaseModel):
|
|
66
66
|
body: str
|
|
67
67
|
format: PushTextFormat
|
|
68
|
-
extract_strategy:
|
|
69
|
-
split_strategy:
|
|
68
|
+
extract_strategy: str | None = None
|
|
69
|
+
split_strategy: str | None = None
|
|
70
70
|
classification_labels: list[ClassificationLabel] = []
|
|
71
71
|
|
|
72
72
|
|
|
@@ -75,18 +75,18 @@ class LinkUpload(BaseModel):
|
|
|
75
75
|
headers: dict[str, str] = {}
|
|
76
76
|
cookies: dict[str, str] = {}
|
|
77
77
|
localstorage: dict[str, str] = {}
|
|
78
|
-
css_selector:
|
|
78
|
+
css_selector: str | None = Field(
|
|
79
79
|
None,
|
|
80
80
|
title="Css selector",
|
|
81
81
|
description="Css selector to parse the link",
|
|
82
82
|
)
|
|
83
|
-
xpath:
|
|
83
|
+
xpath: str | None = Field(
|
|
84
84
|
None,
|
|
85
85
|
title="Xpath",
|
|
86
86
|
description="Xpath to parse the link",
|
|
87
87
|
)
|
|
88
|
-
extract_strategy:
|
|
89
|
-
split_strategy:
|
|
88
|
+
extract_strategy: str | None = None
|
|
89
|
+
split_strategy: str | None = None
|
|
90
90
|
classification_labels: list[ClassificationLabel] = []
|
|
91
91
|
|
|
92
92
|
|
|
@@ -99,14 +99,14 @@ class PushMessageFormat(int, Enum):
|
|
|
99
99
|
|
|
100
100
|
|
|
101
101
|
class PushMessageContent(BaseModel):
|
|
102
|
-
text:
|
|
102
|
+
text: str | None = None
|
|
103
103
|
format: PushMessageFormat
|
|
104
104
|
attachments: list[str] = []
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
class PushMessage(BaseModel):
|
|
108
|
-
timestamp:
|
|
109
|
-
who:
|
|
108
|
+
timestamp: datetime | None = None
|
|
109
|
+
who: str | None = None
|
|
110
110
|
to: list[str] = []
|
|
111
111
|
content: PushMessageContent
|
|
112
112
|
ident: str
|
|
@@ -114,8 +114,8 @@ class PushMessage(BaseModel):
|
|
|
114
114
|
|
|
115
115
|
class PushConversation(BaseModel):
|
|
116
116
|
messages: list[PushMessage] = []
|
|
117
|
-
extract_strategy:
|
|
118
|
-
split_strategy:
|
|
117
|
+
extract_strategy: str | None = None
|
|
118
|
+
split_strategy: str | None = None
|
|
119
119
|
classification_labels: list[ClassificationLabel] = []
|
|
120
120
|
|
|
121
121
|
|
|
@@ -125,19 +125,19 @@ class Source(SourceValue, Enum): # type: ignore
|
|
|
125
125
|
|
|
126
126
|
|
|
127
127
|
class ProcessingInfo(BaseModel):
|
|
128
|
-
seqid:
|
|
129
|
-
account_seq:
|
|
130
|
-
queue:
|
|
128
|
+
seqid: int | None = None
|
|
129
|
+
account_seq: int | None = None
|
|
130
|
+
queue: QueueType | None = None
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
class PushPayload(BaseModel):
|
|
134
134
|
uuid: str
|
|
135
|
-
slug:
|
|
135
|
+
slug: str | None = None
|
|
136
136
|
kbid: str
|
|
137
|
-
source:
|
|
137
|
+
source: Source | None = None
|
|
138
138
|
userid: str
|
|
139
139
|
|
|
140
|
-
title:
|
|
140
|
+
title: str | None = None
|
|
141
141
|
|
|
142
142
|
genericfield: dict[str, Text] = {}
|
|
143
143
|
|
|
@@ -160,4 +160,4 @@ class PushPayload(BaseModel):
|
|
|
160
160
|
partition: int
|
|
161
161
|
|
|
162
162
|
# List of available processing options (with default values)
|
|
163
|
-
processing_options:
|
|
163
|
+
processing_options: PushProcessingOptions | None = Field(default_factory=PushProcessingOptions)
|